6 changes: 6 additions & 0 deletions docs/reference/cli.md
@@ -38,6 +38,7 @@ vllm-mlx serve <model> [options]
 | `--paged-cache-block-size` | Tokens per cache block | 64 |
 | `--max-cache-blocks` | Maximum cache blocks | 1000 |
 | `--max-num-seqs` | Max concurrent sequences | 256 |
+| `--gpu-memory-utilization` | Fraction of device memory for Metal allocation limit (0.0-1.0) | 0.90 |
 | `--default-temperature` | Default temperature when not specified in request | None |
 | `--default-top-p` | Default top_p when not specified in request | None |
 | `--reasoning-parser` | Parser for reasoning models (`qwen3`, `deepseek_r1`) | None |
@@ -88,6 +89,11 @@ vllm-mlx serve mlx-community/granite-4.0-tiny-preview-4bit \
 # With API key authentication
 vllm-mlx serve mlx-community/Llama-3.2-3B-Instruct-4bit --api-key your-secret-key
 
+# Large models (200GB+) — raise memory limit to avoid cache thrashing
+vllm-mlx serve mlx-community/Qwen3.5-397B-A17B-nvfp4 \
+    --continuous-batching \
+    --gpu-memory-utilization 0.95
+
 # Production setup with security options
 vllm-mlx serve mlx-community/Qwen3-4B-4bit \
     --api-key your-secret-key \
16 changes: 16 additions & 0 deletions vllm_mlx/cli.py
@@ -37,6 +37,13 @@ def serve_command(args):
         print("Example: --enable-auto-tool-choice --tool-call-parser mistral")
         sys.exit(1)
 
+    # Validate gpu-memory-utilization range
+    if not (0.0 < args.gpu_memory_utilization <= 1.0):
+        print(
+            "Error: --gpu-memory-utilization must be between 0.0 (exclusive) and 1.0 (inclusive)"
+        )
+        sys.exit(1)
+
     # Configure server security settings
     server._api_key = args.api_key
     server._default_timeout = args.timeout
@@ -204,6 +211,7 @@ def serve_command(args):
         specprefill_threshold=args.specprefill_threshold,
         specprefill_keep_pct=args.specprefill_keep_pct,
         specprefill_draft_model=args.specprefill_draft_model,
+        gpu_memory_utilization=args.gpu_memory_utilization,
     )
 
     # Start server
@@ -704,6 +712,14 @@ def main():
         action="store_true",
         help="Enable continuous batching for multiple concurrent users (slower for single user)",
     )
+    serve_parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.90,
+        help="Fraction of device memory for Metal allocation limit and emergency "
+        "cache clear threshold (0.0-1.0, default: 0.90). Increase to 0.95 for "
+        "large models (200GB+) that need more memory headroom.",
+    )
     # Paged cache options (experimental)
     serve_parser.add_argument(
         "--use-paged-cache",
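For reviewers trying the flag locally, a minimal standalone sketch of the accepted range. It mirrors the check added to `serve_command` above; the helper name and the usage lines are illustrative, not part of the PR:

```python
def validate_gpu_memory_utilization(value: float) -> float:
    """Accept fractions in (0.0, 1.0]: 0.0 and negatives are rejected, 1.0 is allowed."""
    if not (0.0 < value <= 1.0):
        raise ValueError(
            "--gpu-memory-utilization must be between 0.0 (exclusive) and 1.0 (inclusive)"
        )
    return value

validate_gpu_memory_utilization(0.90)    # default
validate_gpu_memory_utilization(1.0)     # allowed: use the full recommended working set
# validate_gpu_memory_utilization(0.0)   # raises ValueError
# validate_gpu_memory_utilization(1.05)  # raises ValueError
```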
10 changes: 8 additions & 2 deletions vllm_mlx/engine/batched.py
@@ -137,6 +137,7 @@ def __init__(
         scheduler_config: Any | None = None,
         stream_interval: int = 1,
         force_mllm: bool = False,
+        gpu_memory_utilization: float = 0.90,
     ):
         """
         Initialize the batched engine.
@@ -147,11 +148,14 @@
             scheduler_config: Optional scheduler configuration
             stream_interval: Tokens to batch before streaming (1=every token)
             force_mllm: Force loading as MLLM even if not auto-detected
+            gpu_memory_utilization: Fraction of device memory for Metal allocation
+                limit and emergency threshold (0.0-1.0, default 0.90)
         """
         self._model_name = model_name
         self._trust_remote_code = trust_remote_code
         self._scheduler_config = scheduler_config
         self._stream_interval = stream_interval
+        self._gpu_memory_utilization = gpu_memory_utilization
         self._is_mllm = force_mllm or is_mllm_model(model_name)
 
         self._model = None
@@ -283,13 +287,14 @@ async def _start_llm(self) -> None:
                 device_info.get("memory_size", 0),
             )
             if max_recommended > 0:
-                soft_limit = int(max_recommended * 0.90)
+                soft_limit = int(max_recommended * self._gpu_memory_utilization)
                 mx.set_memory_limit(soft_limit)
                 mx.set_cache_limit(32 * 1024 * 1024 * 1024)  # 32GB
+                pct = self._gpu_memory_utilization * 100
                 logger.info(
                     f"Metal memory limits set: "
                     f"allocation_limit={soft_limit / 1e9:.1f}GB "
-                    f"(90% of {max_recommended / 1e9:.1f}GB), "
+                    f"({pct:.0f}% of {max_recommended / 1e9:.1f}GB), "
                     f"cache_limit=32GB"
                 )
         except Exception as e:
@@ -301,6 +306,7 @@ async def _start_llm(self) -> None:
             model_name=self._model_name,
             scheduler_config=scheduler_config,
             stream_interval=self._stream_interval,
+            gpu_memory_utilization=self._gpu_memory_utilization,
         )
 
         # Create async engine
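Condensed from the `_start_llm` hunk above, this is the effective limit calculation: a sketch assuming MLX's `mx.device_info()`, `mx.set_memory_limit()`, and `mx.set_cache_limit()` behave as used in the diff; the standalone function name is for illustration only:

```python
import mlx.core as mx

def apply_metal_memory_limits(gpu_memory_utilization: float = 0.90) -> None:
    # Prefer Metal's recommended working set size; fall back to total device memory.
    info = mx.device_info()
    max_recommended = info.get(
        "max_recommended_working_set_size", info.get("memory_size", 0)
    )
    if max_recommended > 0:
        # The soft allocation limit now scales with the configured fraction
        # (previously hard-coded to 0.90).
        soft_limit = int(max_recommended * gpu_memory_utilization)
        mx.set_memory_limit(soft_limit)
        # The MLX buffer cache limit stays fixed at 32 GB regardless of the fraction.
        mx.set_cache_limit(32 * 1024 * 1024 * 1024)
```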
18 changes: 7 additions & 11 deletions vllm_mlx/engine_core.py
@@ -36,6 +36,7 @@ class EngineConfig:
     scheduler_config: Optional[SchedulerConfig] = None
     step_interval: float = 0.001  # 1ms between steps
     stream_interval: int = 1  # Tokens to batch before streaming (1=every token)
+    gpu_memory_utilization: float = 0.90  # Fraction of device memory for allocation
 
 
 class EngineCore:
@@ -150,18 +151,13 @@ async def _engine_loop(self) -> None:
         stream_interval = self.config.stream_interval
         use_simple_streaming = stream_interval == 1
 
-        # Emergency memory pressure threshold — use 85% of Metal's
-        # max recommended working set so this scales with system RAM.
+        # Emergency memory pressure threshold — dynamic based on gpu_memory_utilization
+        # Uses a 5% gap above the soft limit (capped at 99%) to allow temporary spikes.
+        _gpu_mem_util = self.config.gpu_memory_utilization
         try:
-            _device_info = mx.device_info()
-            _max_recommended = _device_info.get(
-                "max_recommended_working_set_size",
-                _device_info.get("memory_size", 0),
-            )
-            _memory_pressure_threshold = (
-                int(_max_recommended * 0.85)
-                if _max_recommended > 0
-                else 200 * 1024 * 1024 * 1024
+            _device_mem = mx.device_info().get("memory_size", 200 * 1024 * 1024 * 1024)
+            _memory_pressure_threshold = int(
+                _device_mem * min(_gpu_mem_util + 0.05, 0.99)
             )
         except Exception:
             _memory_pressure_threshold = 200 * 1024 * 1024 * 1024
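The emergency threshold in `_engine_loop` now tracks the configured fraction with a 5% cushion, capped at 99% of device memory. A small worked example of that arithmetic (device sizes are illustrative):

```python
def memory_pressure_threshold(device_mem_bytes: int, gpu_memory_utilization: float) -> int:
    # 5 percentage points above the soft limit, but never more than 99% of device memory.
    return int(device_mem_bytes * min(gpu_memory_utilization + 0.05, 0.99))

GB = 1024**3
# Default 0.90 on a 128 GB machine: emergency cache clears trigger at 95% ≈ 121.6 GB.
print(memory_pressure_threshold(128 * GB, 0.90) / GB)  # ~121.6
# 0.95 would imply 100%, so the cap pulls it back to 99% ≈ 506.9 GB on a 512 GB machine.
print(memory_pressure_threshold(512 * GB, 0.95) / GB)  # ~506.9
```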
4 changes: 4 additions & 0 deletions vllm_mlx/server.py
@@ -490,6 +490,7 @@ def load_model(
     specprefill_threshold: int = 8192,
     specprefill_keep_pct: float = 0.3,
     specprefill_draft_model: str = None,
+    gpu_memory_utilization: float = 0.90,
 ):
     """
     Load a model (auto-detects MLLM vs LLM).
@@ -507,6 +508,8 @@
         specprefill_threshold: Minimum suffix tokens to trigger SpecPrefill (default: 8192)
         specprefill_keep_pct: Fraction of tokens to keep (default: 0.3)
         specprefill_draft_model: Path to small draft model for SpecPrefill scoring
+        gpu_memory_utilization: Fraction of device memory for Metal allocation
+            limit and emergency threshold (0.0-1.0, default 0.90)
     """
     global _engine, _model_name, _model_path, _default_max_tokens, _tool_parser_instance
 
@@ -526,6 +529,7 @@
         scheduler_config=scheduler_config,
         stream_interval=stream_interval,
         force_mllm=force_mllm,
+        gpu_memory_utilization=gpu_memory_utilization,
     )
     # BatchedEngine will be started in lifespan (uvicorn's event loop)
     # Just log for now