From 1ac9bd6a4144e5f23018f1d674300de04b6d590b Mon Sep 17 00:00:00 2001
From: Jan Hilgard
Date: Mon, 23 Feb 2026 23:50:47 +0100
Subject: [PATCH] feat: Add --gpu-memory-utilization for configurable memory limits

Add a single CLI flag to control both the Metal soft allocation limit
(mx.set_memory_limit) and the emergency cache clear threshold in the
engine loop. Default 0.90 preserves existing behavior.

For large models (200GB+), the previous hardcoded 200GB emergency
threshold and fixed 90% soft limit caused excessive cache clearing,
resulting in ~3.5x slowdown. With --gpu-memory-utilization 0.95 both
limits scale to the actual device memory, eliminating the thrashing.

The emergency threshold is always 5% above the soft limit (capped at
99%) to give MLX headroom for temporary allocations.

Co-Authored-By: Claude Opus 4.6
---
 docs/reference/cli.md      |  6 ++++++
 vllm_mlx/cli.py            | 16 ++++++++++++++++
 vllm_mlx/engine/batched.py | 10 ++++++++--
 vllm_mlx/engine_core.py    | 18 +++++++-----------
 vllm_mlx/server.py         |  4 ++++
 5 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/docs/reference/cli.md b/docs/reference/cli.md
index 2ba8b75e1..ac134ad0b 100644
--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -38,6 +38,7 @@ vllm-mlx serve [options]
 | `--paged-cache-block-size` | Tokens per cache block | 64 |
 | `--max-cache-blocks` | Maximum cache blocks | 1000 |
 | `--max-num-seqs` | Max concurrent sequences | 256 |
+| `--gpu-memory-utilization` | Fraction of device memory for Metal allocation limit (0.0-1.0) | 0.90 |
 | `--default-temperature` | Default temperature when not specified in request | None |
 | `--default-top-p` | Default top_p when not specified in request | None |
 | `--reasoning-parser` | Parser for reasoning models (`qwen3`, `deepseek_r1`) | None |
@@ -88,6 +89,11 @@ vllm-mlx serve mlx-community/granite-4.0-tiny-preview-4bit \
 # With API key authentication
 vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --api-key your-secret-key
 
+# Large models (200GB+) — raise memory limit to avoid cache thrashing
+vllm-mlx serve mlx-community/Qwen3.5-397B-A17B-nvfp4 \
+    --continuous-batching \
+    --gpu-memory-utilization 0.95
+
 # Production setup with security options
 vllm-mlx serve mlx-community/Qwen3-4B-4bit \
     --api-key your-secret-key \
diff --git a/vllm_mlx/cli.py b/vllm_mlx/cli.py
index 8a90bc9be..0649f40da 100644
--- a/vllm_mlx/cli.py
+++ b/vllm_mlx/cli.py
@@ -37,6 +37,13 @@ def serve_command(args):
         print("Example: --enable-auto-tool-choice --tool-call-parser mistral")
         sys.exit(1)
 
+    # Validate gpu-memory-utilization range
+    if not (0.0 < args.gpu_memory_utilization <= 1.0):
+        print(
+            "Error: --gpu-memory-utilization must be between 0.0 (exclusive) and 1.0 (inclusive)"
+        )
+        sys.exit(1)
+
     # Configure server security settings
     server._api_key = args.api_key
     server._default_timeout = args.timeout
@@ -204,6 +211,7 @@ def serve_command(args):
         specprefill_threshold=args.specprefill_threshold,
         specprefill_keep_pct=args.specprefill_keep_pct,
         specprefill_draft_model=args.specprefill_draft_model,
+        gpu_memory_utilization=args.gpu_memory_utilization,
     )
 
     # Start server
@@ -704,6 +712,14 @@ def main():
         action="store_true",
         help="Enable continuous batching for multiple concurrent users (slower for single user)",
     )
+    serve_parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.90,
+        help="Fraction of device memory for Metal allocation limit and emergency "
+        "cache clear threshold (0.0-1.0, default: 0.90). Increase to 0.95 for "
+        "large models (200GB+) that need more memory headroom.",
+    )
     # Paged cache options (experimental)
     serve_parser.add_argument(
         "--use-paged-cache",
diff --git a/vllm_mlx/engine/batched.py b/vllm_mlx/engine/batched.py
index ce33e628e..650584d02 100644
--- a/vllm_mlx/engine/batched.py
+++ b/vllm_mlx/engine/batched.py
@@ -137,6 +137,7 @@ def __init__(
         scheduler_config: Any | None = None,
         stream_interval: int = 1,
         force_mllm: bool = False,
+        gpu_memory_utilization: float = 0.90,
     ):
         """
         Initialize the batched engine.
@@ -147,11 +148,14 @@ def __init__(
             scheduler_config: Optional scheduler configuration
             stream_interval: Tokens to batch before streaming (1=every token)
             force_mllm: Force loading as MLLM even if not auto-detected
+            gpu_memory_utilization: Fraction of device memory for Metal allocation
+                limit and emergency threshold (0.0-1.0, default 0.90)
         """
         self._model_name = model_name
         self._trust_remote_code = trust_remote_code
         self._scheduler_config = scheduler_config
         self._stream_interval = stream_interval
+        self._gpu_memory_utilization = gpu_memory_utilization
         self._is_mllm = force_mllm or is_mllm_model(model_name)
 
         self._model = None
@@ -283,13 +287,14 @@ async def _start_llm(self) -> None:
                 device_info.get("memory_size", 0),
             )
             if max_recommended > 0:
-                soft_limit = int(max_recommended * 0.90)
+                soft_limit = int(max_recommended * self._gpu_memory_utilization)
                 mx.set_memory_limit(soft_limit)
                 mx.set_cache_limit(32 * 1024 * 1024 * 1024)  # 32GB
+                pct = self._gpu_memory_utilization * 100
                 logger.info(
                     f"Metal memory limits set: "
                     f"allocation_limit={soft_limit / 1e9:.1f}GB "
-                    f"(90% of {max_recommended / 1e9:.1f}GB), "
+                    f"({pct:.0f}% of {max_recommended / 1e9:.1f}GB), "
                     f"cache_limit=32GB"
                 )
         except Exception as e:
@@ -301,6 +306,7 @@ async def _start_llm(self) -> None:
             model_name=self._model_name,
             scheduler_config=scheduler_config,
             stream_interval=self._stream_interval,
+            gpu_memory_utilization=self._gpu_memory_utilization,
         )
 
         # Create async engine
diff --git a/vllm_mlx/engine_core.py b/vllm_mlx/engine_core.py
index a4a1e8ed6..d210c473c 100644
--- a/vllm_mlx/engine_core.py
+++ b/vllm_mlx/engine_core.py
@@ -36,6 +36,7 @@ class EngineConfig:
     scheduler_config: Optional[SchedulerConfig] = None
     step_interval: float = 0.001  # 1ms between steps
     stream_interval: int = 1  # Tokens to batch before streaming (1=every token)
+    gpu_memory_utilization: float = 0.90  # Fraction of device memory for allocation
 
 
 class EngineCore:
@@ -150,18 +151,13 @@ async def _engine_loop(self) -> None:
         stream_interval = self.config.stream_interval
         use_simple_streaming = stream_interval == 1
 
-        # Emergency memory pressure threshold — use 85% of Metal's
-        # max recommended working set so this scales with system RAM.
+        # Emergency memory pressure threshold — dynamic based on gpu_memory_utilization
+        # Uses a 5% gap above the soft limit (capped at 99%) to allow temporary spikes.
+        _gpu_mem_util = self.config.gpu_memory_utilization
         try:
-            _device_info = mx.device_info()
-            _max_recommended = _device_info.get(
-                "max_recommended_working_set_size",
-                _device_info.get("memory_size", 0),
-            )
-            _memory_pressure_threshold = (
-                int(_max_recommended * 0.85)
-                if _max_recommended > 0
-                else 200 * 1024 * 1024 * 1024
+            _device_mem = mx.device_info().get("memory_size", 200 * 1024 * 1024 * 1024)
+            _memory_pressure_threshold = int(
+                _device_mem * min(_gpu_mem_util + 0.05, 0.99)
             )
         except Exception:
             _memory_pressure_threshold = 200 * 1024 * 1024 * 1024
diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py
index cf3e66596..3a12f16c8 100644
--- a/vllm_mlx/server.py
+++ b/vllm_mlx/server.py
@@ -490,6 +490,7 @@ def load_model(
     specprefill_threshold: int = 8192,
     specprefill_keep_pct: float = 0.3,
     specprefill_draft_model: str = None,
+    gpu_memory_utilization: float = 0.90,
 ):
     """
     Load a model (auto-detects MLLM vs LLM).
@@ -507,6 +508,8 @@ def load_model(
         specprefill_threshold: Minimum suffix tokens to trigger SpecPrefill (default: 8192)
         specprefill_keep_pct: Fraction of tokens to keep (default: 0.3)
         specprefill_draft_model: Path to small draft model for SpecPrefill scoring
+        gpu_memory_utilization: Fraction of device memory for Metal allocation
+            limit and emergency threshold (0.0-1.0, default 0.90)
     """
     global _engine, _model_name, _model_path, _default_max_tokens, _tool_parser_instance
 
@@ -526,6 +529,7 @@ def load_model(
         scheduler_config=scheduler_config,
         stream_interval=stream_interval,
         force_mllm=force_mllm,
+        gpu_memory_utilization=gpu_memory_utilization,
     )
     # BatchedEngine will be started in lifespan (uvicorn's event loop)
     # Just log for now
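
As a sanity check on the arithmetic above, a small self-contained sketch (illustrative only, not part of the patch): the derive_limits helper and the 512 GiB device figure are hypothetical, and it applies a single device-memory value to both limits, whereas the patch reads max_recommended_working_set_size for the soft limit and memory_size for the emergency threshold.

# Illustrative sketch only; helper name and numbers are hypothetical.
GB = 1024**3


def derive_limits(device_memory: int, gpu_memory_utilization: float = 0.90):
    """Return (soft_limit, emergency_threshold) in bytes.

    Mirrors the patch's arithmetic: the soft limit feeds mx.set_memory_limit(),
    and the emergency cache-clear threshold sits 5% above the utilization
    fraction, capped at 99% of device memory.
    """
    soft_limit = int(device_memory * gpu_memory_utilization)
    emergency_threshold = int(device_memory * min(gpu_memory_utilization + 0.05, 0.99))
    return soft_limit, emergency_threshold


soft, emergency = derive_limits(512 * GB, 0.95)
print(f"soft={soft / GB:.0f}GiB emergency={emergency / GB:.0f}GiB")
# 512 GiB device at 0.95: soft=486GiB, emergency=507GiB (0.95 + 0.05, capped at 0.99)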