waybarrios · Thump604 · Apr 11, 2026 · Apr 11, 2026
diff --git a/scripts/add_mtp_weights_qwen35.py b/scripts/add_mtp_weights_qwen35.py
diff --git a/vllm_mlx/api/models.py b/vllm_mlx/api/models.py
@@ -172,12 +172,16 @@ class ChatCompletionRequest(BaseModel):
     # MLLM-specific parameters
     video_fps: float | None = None
     video_max_frames: int | None = None
+    # Sampling penalties
+    repetition_penalty: float | None = None  # mlx-lm style (>1.0 penalizes)
     # Request timeout in seconds (None = use server default)
     timeout: float | None = None
     # SpecPrefill: per-request enable/disable (None = server decides)
     specprefill: bool | None = None
     # SpecPrefill: per-request keep percentage (0.0-1.0, None = use server default)
     specprefill_keep_pct: float | None = None
+    # Enable/disable thinking mode (None = server default, typically True)
+    enable_thinking: bool | None = None
 
 
 class AssistantMessage(BaseModel):
@@ -239,6 +243,8 @@ class CompletionRequest(BaseModel):
     max_tokens: int | None = None
     stream: bool = False
     stop: list[str] | None = None
+    # Sampling penalties
+    repetition_penalty: float | None = None  # mlx-lm style (>1.0 penalizes)
     # Request timeout in seconds (None = use server default)
     timeout: float | None = None
 

diff --git a/vllm_mlx/api/utils.py b/vllm_mlx/api/utils.py
@@ -20,7 +20,9 @@
     r"<\|im_end\|>|<\|im_start\|>|<\|endoftext\|>|"
     r"<\|end\|>|<\|eot_id\|>|<\|start_header_id\|>|<\|end_header_id\|>|"
     r"<\|channel\|>|<\|message\|>|<\|start\|>|<\|return\|>|<\|call\|>|<\|constrain\|>|"
-    r"</s>|<s>|<pad>|\[PAD\]|\[SEP\]|\[CLS\]"
+    r"</s>|<s>|<pad>|\[PAD\]|\[SEP\]|\[CLS\]|"
+    r"\[e~\[|\]~b\][a-z]*|\]~!b\[|"
+    r"</?tool_call>|</?tool_call_reasoning>"
 )
 
 
@@ -356,6 +358,8 @@ def flush(self) -> list[tuple[str, str]]:
     "InternVL",  # InternVL
     "deepseek-vl",
     "DeepSeek-VL",  # DeepSeek-VL
+    "Qwen3.5-",
+    "qwen3_5",  # Qwen3.5 MoE (natively multimodal, hybrid ArraysCache+KVCache)
 ]
 
 

diff --git a/vllm_mlx/cli.py b/vllm_mlx/cli.py
@@ -37,6 +37,13 @@ def serve_command(args):
         print("Example: --enable-auto-tool-choice --tool-call-parser mistral")
         sys.exit(1)
 
+    # Validate --gpu-memory-utilization: must satisfy 0.0 < value <= 1.0
+    if not (0.0 < args.gpu_memory_utilization <= 1.0):
+        print(
+            "Error: --gpu-memory-utilization must be between 0.0 (exclusive) and 1.0 (inclusive)"
+        )
+        sys.exit(1)
+
     # Configure server security settings
     server._api_key = args.api_key
     server._default_timeout = args.timeout
@@ -196,7 +203,8 @@ def serve_command(args):
         scheduler_config=scheduler_config,
         stream_interval=args.stream_interval if args.continuous_batching else 1,
         max_tokens=args.max_tokens,
-        force_mllm=args.mllm,
+        force_mllm=getattr(args, "mllm", False),
+        gpu_memory_utilization=args.gpu_memory_utilization,
         served_model_name=args.served_model_name,
         mtp=args.enable_mtp,
         prefill_step_size=args.prefill_step_size,
@@ -704,6 +712,14 @@ def main():
         action="store_true",
         help="Enable continuous batching for multiple concurrent users (slower for single user)",
     )
+    serve_parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.90,
+        help="Fraction of device memory for Metal allocation limit and emergency "
+        "cache clear threshold (0.0-1.0, default: 0.90). Increase to 0.95 for "
+        "large models (200GB+) that need more memory headroom.",
+    )
     # Paged cache options (experimental)
     serve_parser.add_argument(
         "--use-paged-cache",
@@ -838,12 +854,14 @@ def main():
             "nemotron",
             "xlam",
             "functionary",
+            "gemma4",
             "glm47",
+            "minimax",
         ],
         help=(
             "Select the tool call parser for the model. Options: "
             "auto (auto-detect), mistral, qwen, qwen3_coder, llama, hermes, "
-            "deepseek, kimi, granite, nemotron, xlam, functionary, glm47. "
+            "deepseek, gemma4, kimi, granite, nemotron, xlam, functionary, glm47, minimax. "
             "Required for --enable-auto-tool-choice."
         ),
     )