Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions vllm_mlx/engine_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,17 @@ async def _engine_loop(self) -> None:
stream_interval = self.config.stream_interval
use_simple_streaming = stream_interval == 1

# Emergency memory pressure threshold (200GB)
_memory_pressure_threshold = 200 * 1024 * 1024 * 1024
# Emergency memory pressure threshold — use 85% of Metal's
# max recommended working set so this scales with system RAM.
try:
_device_info = mx.device_info()
_max_recommended = _device_info.get(
"max_recommended_working_set_size",
_device_info.get("memory_size", 0),
)
_memory_pressure_threshold = int(_max_recommended * 0.85) if _max_recommended > 0 else 200 * 1024 * 1024 * 1024
except Exception:
_memory_pressure_threshold = 200 * 1024 * 1024 * 1024
_memory_check_interval = 64

while self._running:
Expand Down
33 changes: 31 additions & 2 deletions vllm_mlx/utils/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,37 @@ def load_model_with_fallback(model_name: str, tokenizer_config: dict = None):
if "TokenizersBackend" in str(e) or "Tokenizer class" in str(e):
logger.warning(f"Standard tokenizer loading failed, using fallback: {e}")
return _load_with_tokenizer_fallback(model_name)
else:
raise
# Fallback for vision-text models loaded as text-only (e.g., Qwen3.5).
# The weight files contain vision tower params that mlx-lm's text model
# doesn't define. Retry with strict=False to discard extra weights.
if "parameters not in model" in str(e):
logger.warning(
"Model has extra weights (likely vision tower), "
"retrying with strict=False to load as text-only."
)
return _load_strict_false(model_name, tokenizer_config)
raise


def _load_strict_false(model_name: str, tokenizer_config: dict = None):
    """Load *model_name* with ``strict=False`` so weights the text model does
    not define (e.g. a vision tower) are silently discarded instead of raising.

    Returns a ``(model, tokenizer)`` pair, mirroring the standard loader.
    """
    from mlx_lm.utils import load_model, load_tokenizer

    # A local directory is used as-is; anything else is treated as a
    # Hugging Face repo id and materialized into the local HF cache.
    candidate = Path(model_name)
    if candidate.is_dir():
        resolved_path = candidate
    else:
        from huggingface_hub import snapshot_download

        resolved_path = Path(snapshot_download(model_name))

    model, config = load_model(resolved_path, strict=False)
    tok = load_tokenizer(
        resolved_path,
        {} if not tokenizer_config else tokenizer_config,
        eos_token_ids=config.get("eos_token_id", None),
    )
    return model, tok


def _load_with_tokenizer_fallback(model_name: str):
Expand Down