wallaroo.deployment_config


class DeploymentConfig(typing.Dict):
Inherited Members
  builtins.dict: get, setdefault, pop, popitem, keys, items, values, update, fromkeys, clear, copy
class DeploymentConfigBuilder:
DeploymentConfigBuilder(workspace_id: Optional[int] = None)
def replica_count(self, count: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:
def replica_autoscale_min_max(self, maximum: int, minimum: int = 0):

Configures the minimum and maximum number of replicas for autoscaling.

def autoscale_cpu_utilization(self, cpu_utilization_percentage: int):

Sets the average CPU utilization, as a percentage, at which autoscaling triggers.

def disable_autoscale(self):

Disables autoscaling in the deployment configuration

def cpus(self, core_count: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:
def deployment_label(self, label: str) -> wallaroo.deployment_config.DeploymentConfigBuilder:
def gpus(self, gpu_count: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:
def memory(self, memory_spec: str) -> wallaroo.deployment_config.DeploymentConfigBuilder:
def lb_cpus(self, core_count: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:
def lb_memory(self, memory_spec: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:
def python_load_timeout_secs(self, timeout_secs: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:
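Each of the builder methods above returns the builder itself, so a deployment configuration is normally written as a single method chain. A minimal sketch of that fluent-builder pattern, using a toy stand-in class (not the real wallaroo.deployment_config.DeploymentConfigBuilder, which builds a richer DeploymentConfig dict):

```python
# Toy stand-in illustrating the fluent-builder pattern used by
# DeploymentConfigBuilder; the class name and dict layout are illustrative only.
class SketchBuilder:
    def __init__(self):
        self._cfg = {}

    def replica_count(self, count: int) -> "SketchBuilder":
        self._cfg["replica_count"] = count
        return self  # returning self is what enables chaining

    def cpus(self, core_count: int) -> "SketchBuilder":
        self._cfg["cpus"] = core_count
        return self

    def memory(self, memory_spec: str) -> "SketchBuilder":
        self._cfg["memory"] = memory_spec
        return self

    def build(self) -> dict:
        return dict(self._cfg)

cfg = SketchBuilder().replica_count(2).cpus(1).memory("2Gi").build()
print(cfg)  # {'replica_count': 2, 'cpus': 1, 'memory': '2Gi'}
```

The real builder works the same way: configure with chained calls, then call `build()` to obtain the DeploymentConfig passed at deploy time.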
def sidekick_gpus(self, model_version: wallaroo.model_version.ModelVersion, gpu_count: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Sets the number of GPUs to be used for the model's sidekick container. Only affects image-based models (e.g. MLFlow models) in a deployment.

Parameters
  • ModelVersion model_version: The sidekick model to configure.
  • int gpu_count: Number of GPUs to use in this sidekick.
Returns

This DeploymentConfigBuilder instance for chaining.

def sidekick_cpus(self, model_version: wallaroo.model_version.ModelVersion, core_count: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Sets the number of CPUs to be used for the model's sidekick container. Only affects image-based models (e.g. MLFlow models) in a deployment.

Parameters
  • ModelVersion model_version: The sidekick model to configure.
  • int core_count: Number of CPU cores to use in this sidekick.
Returns

This DeploymentConfigBuilder instance for chaining.

def sidekick_memory(self, model_version: wallaroo.model_version.ModelVersion, memory_spec: str) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Sets the memory to be used for the model's sidekick container. Only affects image-based models (e.g. MLFlow models) in a deployment.

Parameters
  • ModelVersion model_version: The sidekick model to configure.
  • str memory_spec: Specification of amount of memory (e.g., "2Gi", "500Mi") to use in this sidekick.
Returns

This DeploymentConfigBuilder instance for chaining.
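The memory_spec strings such as "2Gi" and "500Mi" follow Kubernetes-style quantity notation. A hedged sketch of how the binary suffixes resolve to byte counts (a simplified parser for illustration only; the platform side accepts the full Kubernetes quantity grammar, which this does not implement):

```python
def parse_memory_spec(spec: str) -> int:
    """Resolve a binary-suffix memory spec ("Ki", "Mi", "Gi") to bytes.

    Simplified for illustration; real Kubernetes quantities also allow
    decimal suffixes ("M", "G") and exponent forms.
    """
    suffixes = {"Ki": 2**10, "Mi": 2**20, "Gi": 2**30}
    for suffix, multiplier in suffixes.items():
        if spec.endswith(suffix):
            return int(spec[: -len(suffix)]) * multiplier
    return int(spec)  # plain integer: bytes

print(parse_memory_spec("2Gi"))    # 2147483648
print(parse_memory_spec("500Mi"))  # 524288000
```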

def sidekick_env(self, model_version: wallaroo.model_version.ModelVersion, environment: Dict[str, str]) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Sets the environment variables to be set for the model's sidekick container. Only affects image-based models (e.g. MLFlow models) in a deployment.

Parameters
  • ModelVersion model_version: The sidekick model to configure.
  • Dict[str, str] environment: Dictionary of environment variables names and their corresponding values to be set in the sidekick container.
Returns

This DeploymentConfigBuilder instance for chaining.

def sidekick_arch(self, model_version: wallaroo.model_version.ModelVersion, arch: Optional[Architecture] = None) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Sets the machine architecture for the model's sidekick container. Only affects image-based models (e.g. MLFlow models) in a deployment.

Parameters
  • ModelVersion model_version: The sidekick model to configure.
  • Optional[Architecture] arch: Machine architecture for this sidekick.
Returns

This DeploymentConfigBuilder instance for chaining.

def sidekick_accel(self, model_version: wallaroo.model_version.ModelVersion, accel: Optional[Acceleration] = None) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Sets the acceleration option for the model's sidekick container. Only affects image-based models (e.g. MLFlow models) in a deployment.

Parameters
  • ModelVersion model_version: The sidekick model to configure.
  • Optional[Acceleration] accel: Acceleration option for this sidekick.
Returns

This DeploymentConfigBuilder instance for chaining.

def scale_up_queue_depth(self, queue_depth: int) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Configure the scale_up_queue_depth threshold as an autoscaling trigger.

This method sets a queue depth threshold above which all pipeline components (including the engine and LLM sidekicks) will incrementally scale up.

The scale_up_queue_depth is calculated as: (number of requests in queue + requests being processed) / number of available replicas over a scaling window.

Notes:
  • This parameter must be configured to activate queue-based autoscaling.
  • No default value is provided.
  • When configured, scale_up_queue_depth overrides the default autoscaling trigger (cpu_utilization).
  • The setting applies to all components of the pipeline.
  • When set, scale_down_queue_depth is automatically set to 1 if not already configured.

Parameters
  • int queue_depth: The threshold value for queue-based autoscaling.
Returns

DeploymentConfigBuilder: The current instance for method chaining.
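The queue-depth formula above can be worked through numerically. A small sketch with hypothetical numbers (plain Python, not wallaroo code) showing how the observed depth compares against a configured threshold:

```python
def observed_queue_depth(queued: int, in_flight: int, replicas: int) -> float:
    # (number of requests in queue + requests being processed)
    #   / number of available replicas
    return (queued + in_flight) / replicas

# Hypothetical snapshot: 8 queued requests, 4 in flight, 2 replicas up.
depth = observed_queue_depth(8, 4, 2)
print(depth)  # 6.0

# With scale_up_queue_depth configured at 5, this depth sustained over the
# scaling window would trigger a scale-up.
print(depth > 5)  # True
```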

def scale_down_queue_depth(self, queue_depth: Optional[int] = None) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Configure the scale_down_queue_depth threshold as an autoscaling trigger.

This method sets a queue depth threshold below which all pipeline components (including the engine and LLM sidekicks) will incrementally scale down.

The scale_down_queue_depth is calculated as: (number of requests in queue + requests being processed) / number of available replicas over a scaling window.

Notes:
  • This parameter is optional and defaults to 1 if not set.
  • scale_down_queue_depth is only applicable when scale_up_queue_depth is configured.
  • The setting applies to all components of the pipeline.
  • This threshold helps prevent unnecessary scaling down when the workload is still significant but below the scale-up threshold.

Parameters
  • Optional[int] queue_depth: The threshold value for queue-based downscaling. Defaults to 1 if not set.
Returns

DeploymentConfigBuilder: The current instance for method chaining.

Raises
  • ValueError: If scale_up_queue_depth is not configured.

def autoscaling_window(self, window_seconds: Optional[int] = None) -> wallaroo.deployment_config.DeploymentConfigBuilder:

Configure the autoscaling window for incrementally scaling up/down pipeline components.

This method sets the time window over which the autoscaling metrics are evaluated for making scaling decisions. It applies to all components of the pipeline, including the engine and LLM sidekicks.

Notes:
  • The default value is 300 seconds if not specified.
  • This setting is only applicable when scale_up_queue_depth is configured.
  • The autoscaling window helps smooth out short-term fluctuations in workload and prevents rapid scaling events.

Parameters
  • Optional[int] window_seconds: The duration of the autoscaling window in seconds. If None, the default of 300 seconds is used.
Returns

DeploymentConfigBuilder: The current instance for method chaining.

Raises
  • ValueError: If scale_up_queue_depth is not configured.
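The interaction between the three queue-based settings documented above can be sketched as follows. This is a toy stand-in, not the real builder; it only encodes the documented rules: scale_down_queue_depth defaults to 1 once scale_up_queue_depth is set, and both scale_down_queue_depth and autoscaling_window raise ValueError when scale_up_queue_depth has not been configured:

```python
# Toy stand-in encoding the documented validation rules; the dict keys here
# are illustrative, not the real DeploymentConfig layout.
class AutoscaleSketch:
    def __init__(self):
        self._cfg = {}

    def scale_up_queue_depth(self, queue_depth: int) -> "AutoscaleSketch":
        self._cfg["scale_up_queue_depth"] = queue_depth
        # Documented behavior: scale_down_queue_depth is set to 1
        # if not already configured.
        self._cfg.setdefault("scale_down_queue_depth", 1)
        return self

    def scale_down_queue_depth(self, queue_depth=None) -> "AutoscaleSketch":
        if "scale_up_queue_depth" not in self._cfg:
            raise ValueError("scale_up_queue_depth is not configured")
        self._cfg["scale_down_queue_depth"] = 1 if queue_depth is None else queue_depth
        return self

    def autoscaling_window(self, window_seconds=None) -> "AutoscaleSketch":
        if "scale_up_queue_depth" not in self._cfg:
            raise ValueError("scale_up_queue_depth is not configured")
        # Documented default window is 300 seconds.
        self._cfg["autoscaling_window"] = 300 if window_seconds is None else window_seconds
        return self

builder = AutoscaleSketch().scale_up_queue_depth(5).autoscaling_window()
print(builder._cfg)
# {'scale_up_queue_depth': 5, 'scale_down_queue_depth': 1, 'autoscaling_window': 300}

try:
    AutoscaleSketch().autoscaling_window()
except ValueError as err:
    print(err)  # scale_up_queue_depth is not configured
```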