wallaroo.engine_config


class Architecture(builtins.str, enum.Enum):

An Enum to represent the supported processor architectures.

X86 = <Architecture.X86: 'x86'>
ARM = <Architecture.ARM: 'arm'>
Power10 = <Architecture.Power10: 'power10'>
@classmethod
def default(cls) -> Architecture:
class Acceleration(builtins.str, enum.Enum):

An Enum to represent the supported acceleration options.

CUDA = <Acceleration.CUDA: 'cuda'>
Jetson = <Acceleration.Jetson: 'jetson'>
OpenVINO = <Acceleration.OpenVINO: 'openvino'>
QAIC = <Acceleration.QAIC: 'qaic'>
@classmethod
def default(cls) -> Acceleration:
def default_acceleration_with_config(self) -> AccelerationWithConfig:
def is_applicable(self, arch: Architecture) -> bool:
def requires_config(self) -> bool:
def with_config( self, config: QaicConfig) -> AccelerationWithConfig:

Create an acceleration with a config. Required only for the Qaic acceleration for now.

Parameters
  • config (QaicConfig): The config to use for the acceleration.
Returns
  • AccelerationWithConfig: The acceleration with the given config.
Raises
  • ModelOptimizationConfigError: If the acceleration is not supported.

@runtime_checkable
class OpenapiAccelerationWithConfig(typing_extensions.Protocol):

Base class for protocol classes.

Protocol classes are defined as::

class Proto(Protocol):
    def meth(self) -> int:
        ...

Such classes are primarily used with static type checkers that recognize structural subtyping (static duck-typing), for example::

class C:
    def meth(self) -> int:
        return 0

def func(x: Proto) -> int:
    return x.meth()

func(C())  # Passes static type check

See PEP 544 for details. Protocol classes decorated with @typing.runtime_checkable act as simple-minded runtime protocols that check only the presence of given attributes, ignoring their type signatures. Protocol classes can be generic, they are defined as::

class GenProto(Protocol[T]):
    def meth(self) -> T:
        ...
OpenapiAccelerationWithConfig(*args, **kwargs)
def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict( cls, data: Dict[str, Any]) -> OpenapiAccelerationWithConfig:
class AccelerationWithConfig(pydantic.main.BaseModel, abc.ABC):

A base class for all accelerations that require a config.

model_config = {'arbitrary_types_allowed': True, 'extra': 'forbid', 'protected_namespaces': ()}
config: pydantic.main.BaseModel
def is_applicable(self, arch: Architecture) -> bool:
def to_dict(self) -> Dict[str, Any]:

Convert the config to a dictionary.

model_fields = {'accel': FieldInfo(annotation=Acceleration, required=True), 'config': FieldInfo(annotation=BaseModel, required=True)}
model_computed_fields = {}
class QaicConfig(pydantic.main.BaseModel):

A config for the Qaic acceleration.

model_config = {'arbitrary_types_allowed': True, 'extra': 'forbid', 'use_enum_values': True, 'protected_namespaces': ()}
num_cores: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, int]
num_devices: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, int]
ctx_len: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, int]
prefill_seq_len: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, int]
full_batch_size: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, int]
mxfp6_matmul: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, bool]
mxint8_kv_cache: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, bool]
aic_enable_depth_first: Union[wallaroo.wallaroo_ml_ops_api_client.types.Unset, bool]
model_fields = {'num_cores': FieldInfo(annotation=Union[Unset, int], required=False, default=16, description='Number of cores used to compile the model. Defaults to `16`.', validate_default=True, metadata=[Ge(ge=1)]), 'num_devices': FieldInfo(annotation=Union[Unset, int], required=False, default=1, description='Number of SoCs in a given card to compile the model for. Each card (e.g. AI100) has 4 SoCs. Defaults to `1`.', validate_default=True, metadata=[Ge(ge=1)]), 'ctx_len': FieldInfo(annotation=Union[Unset, int], required=False, default=128, description='Maximum context that the compiled model can remember. Defaults to `128`.', validate_default=True, metadata=[Ge(ge=1)]), 'prefill_seq_len': FieldInfo(annotation=Union[Unset, int], required=False, default=32, description='The length of the Prefill prompt. Defaults to `32`.', validate_default=True, metadata=[Ge(ge=1)]), 'full_batch_size': FieldInfo(annotation=Union[Unset, int], required=False, default=8, description='Maximum number of sequences per iteration. Set to enable continuous batching mode. Defaults to `8`.', metadata=[Ge(ge=1)]), 'mxfp6_matmul': FieldInfo(annotation=Union[Unset, bool], required=False, default=False, description='Enable compilation for MXFP6 precision. Defaults to `False`.', validate_default=True), 'mxint8_kv_cache': FieldInfo(annotation=Union[Unset, bool], required=False, default=False, description='Compress Present/Past KV to MXINT8. Defaults to `False`.', validate_default=True), 'aic_enable_depth_first': FieldInfo(annotation=Union[Unset, bool], required=False, default=False, description='Enables DFS with default memory size. Defaults to `False`.', validate_default=True)}
model_computed_fields = {}
class QaicWithConfig(AccelerationWithConfig):

An acceleration with a config for the QAIC acceleration: pairs `Acceleration.QAIC` with a `QaicConfig`.

accel: Literal[<Acceleration.QAIC: 'qaic'>]
config: QaicConfig
model_config = {'arbitrary_types_allowed': True, 'extra': 'forbid', 'protected_namespaces': ()}
model_fields = {'accel': FieldInfo(annotation=Literal[<Acceleration.QAIC: 'qaic'>], required=False, default=<Acceleration.QAIC: 'qaic'>), 'config': FieldInfo(annotation=QaicConfig, required=False, default=QaicConfig(num_cores=16, num_devices=1, ctx_len=128, prefill_seq_len=32, full_batch_size=8, mxfp6_matmul=False, mxint8_kv_cache=False, aic_enable_depth_first=False))}
model_computed_fields = {}
class EngineConfig:

Wraps an engine config.

EngineConfig( cpus: int, gpus: Optional[int] = 0, inference_channel_size: Optional[int] = None, model_concurrency: Optional[int] = None, pipeline_config_directory: Optional[str] = None, model_config_directory: Optional[str] = None, model_directory: Optional[str] = None, audit_logging: bool = False, arch: Architecture = <Architecture.X86: 'x86'>, accel: Acceleration = <Acceleration._None: 'none'>)
def to_json(self) -> str:

Returns a JSON representation of this object.

class InvalidAccelerationError(builtins.Exception):

Raised when the specified acceleration is incompatible with the given platform architecture.

class ModelOptimizationConfigError(builtins.Exception):

Raised when the specified model optimization configuration is not available.