diff --git a/vllm_mlx/engine_core.py b/vllm_mlx/engine_core.py
index aaa0ccc47..fa47d3c3d 100644
--- a/vllm_mlx/engine_core.py
+++ b/vllm_mlx/engine_core.py
@@ -150,8 +150,21 @@ async def _engine_loop(self) -> None:
         stream_interval = self.config.stream_interval
         use_simple_streaming = stream_interval == 1
 
-        # Emergency memory pressure threshold (200GB)
-        _memory_pressure_threshold = 200 * 1024 * 1024 * 1024
+        # Emergency memory pressure threshold: use 85% of Metal's
+        # max recommended working set so this scales with system RAM.
+        try:
+            _device_info = mx.device_info()
+            _max_recommended = _device_info.get(
+                "max_recommended_working_set_size",
+                _device_info.get("memory_size", 0),
+            )
+            _memory_pressure_threshold = (
+                int(_max_recommended * 0.85)
+                if _max_recommended > 0
+                else 200 * 1024 * 1024 * 1024
+            )
+        except Exception:
+            _memory_pressure_threshold = 200 * 1024 * 1024 * 1024
         _memory_check_interval = 64
 
         while self._running:
diff --git a/vllm_mlx/utils/tokenizer.py b/vllm_mlx/utils/tokenizer.py
index 3c12e7ae2..d85b83689 100644
--- a/vllm_mlx/utils/tokenizer.py
+++ b/vllm_mlx/utils/tokenizer.py
@@ -57,8 +57,37 @@ def load_model_with_fallback(model_name: str, tokenizer_config: dict = None):
         if "TokenizersBackend" in str(e) or "Tokenizer class" in str(e):
             logger.warning(f"Standard tokenizer loading failed, using fallback: {e}")
             return _load_with_tokenizer_fallback(model_name)
-        else:
-            raise
+        # Fallback for vision-text models loaded as text-only (e.g., Qwen3.5).
+        # The weight files contain vision tower params that mlx-lm's text model
+        # doesn't define. Retry with strict=False to discard extra weights.
+        if "parameters not in model" in str(e):
+            logger.warning(
+                "Model has extra weights (likely vision tower), "
+                "retrying with strict=False to load as text-only."
+            )
+            return _load_strict_false(model_name, tokenizer_config)
+        raise
+
+
+def _load_strict_false(model_name: str, tokenizer_config: dict = None):
+    """Load model with strict=False to discard extra weights (e.g., vision tower)."""
+    from mlx_lm.utils import load_model, load_tokenizer
+
+    local_path = Path(model_name)
+    if local_path.is_dir():
+        model_path = local_path
+    else:
+        from huggingface_hub import snapshot_download
+
+        model_path = Path(snapshot_download(model_name))
+
+    model, config = load_model(model_path, strict=False)
+    tokenizer = load_tokenizer(
+        model_path,
+        tokenizer_config or {},
+        eos_token_ids=config.get("eos_token_id", None),
+    )
+    return model, tokenizer
 
 
 def _load_with_tokenizer_fallback(model_name: str):
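
Note on the engine_core.py hunk: a minimal sketch to sanity-check the
threshold the engine loop will now pick on a given machine. It assumes
mx.device_info() exposes the same keys the patch reads (on some MLX
versions the equivalent call is mx.metal.device_info()):

    # Mirror the patch's threshold computation and print the result.
    import mlx.core as mx

    DEFAULT = 200 * 1024 * 1024 * 1024  # the old hard-coded 200GB fallback

    try:
        info = mx.device_info()
        max_recommended = info.get(
            "max_recommended_working_set_size", info.get("memory_size", 0)
        )
        threshold = int(max_recommended * 0.85) if max_recommended > 0 else DEFAULT
    except Exception:
        threshold = DEFAULT

    print(f"memory pressure threshold: {threshold / 2**30:.1f} GiB")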
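
Note on the tokenizer.py hunk: the strict=False path can be exercised
directly with a sketch like the one below; "path/to/snapshot" is a
placeholder for a local model directory whose weight files carry extra
(e.g. vision tower) parameters:

    # Load a vision-text checkpoint as text-only, the way _load_strict_false
    # does: strict=False drops weights the text model does not define instead
    # of raising "parameters not in model".
    from pathlib import Path

    from mlx_lm.utils import load_model, load_tokenizer

    model_path = Path("path/to/snapshot")  # placeholder

    model, config = load_model(model_path, strict=False)
    tokenizer = load_tokenizer(
        model_path, {}, eos_token_ids=config.get("eos_token_id", None)
    )

Silently discarding weights can also mask a genuinely mismatched
checkpoint, which is why the patch only retries after matching the
specific "parameters not in model" error message.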