Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
470 changes: 470 additions & 0 deletions scripts/add_mtp_weights_qwen35.py

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions vllm_mlx/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,16 @@ class ChatCompletionRequest(BaseModel):
# MLLM-specific parameters
video_fps: float | None = None
video_max_frames: int | None = None
# Sampling penalties
repetition_penalty: float | None = None # mlx-lm style (>1.0 penalizes)
# Request timeout in seconds (None = use server default)
timeout: float | None = None
# SpecPrefill: per-request enable/disable (None = server decides)
specprefill: bool | None = None
# SpecPrefill: per-request keep percentage (0.0-1.0, None = use server default)
specprefill_keep_pct: float | None = None
# Enable/disable thinking mode (None = server default, typically True)
enable_thinking: bool | None = None


class AssistantMessage(BaseModel):
Expand Down Expand Up @@ -239,6 +243,8 @@ class CompletionRequest(BaseModel):
max_tokens: int | None = None
stream: bool = False
stop: list[str] | None = None
# Sampling penalties
repetition_penalty: float | None = None # mlx-lm style (>1.0 penalizes)
# Request timeout in seconds (None = use server default)
timeout: float | None = None

Expand Down
6 changes: 5 additions & 1 deletion vllm_mlx/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
r"<\|im_end\|>|<\|im_start\|>|<\|endoftext\|>|"
r"<\|end\|>|<\|eot_id\|>|<\|start_header_id\|>|<\|end_header_id\|>|"
r"<\|channel\|>|<\|message\|>|<\|start\|>|<\|return\|>|<\|call\|>|<\|constrain\|>|"
r"</s>|<s>|<pad>|\[PAD\]|\[SEP\]|\[CLS\]"
r"</s>|<s>|<pad>|\[PAD\]|\[SEP\]|\[CLS\]|"
r"\[e~\[|\]~b\][a-z]*|\]~!b\[|"
r"</?tool_call>|</?tool_call_reasoning>"
)


Expand Down Expand Up @@ -356,6 +358,8 @@ def flush(self) -> list[tuple[str, str]]:
"InternVL", # InternVL
"deepseek-vl",
"DeepSeek-VL", # DeepSeek-VL
"Qwen3.5-",
"qwen3_5", # Qwen3.5 MoE (natively multimodal, hybrid ArraysCache+KVCache)
]


Expand Down
22 changes: 20 additions & 2 deletions vllm_mlx/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ def serve_command(args):
print("Example: --enable-auto-tool-choice --tool-call-parser mistral")
sys.exit(1)

# Validate gpu-memory-utilization range
if not (0.0 < args.gpu_memory_utilization <= 1.0):
print(
"Error: --gpu-memory-utilization must be between 0.0 (exclusive) and 1.0 (inclusive)"
)
sys.exit(1)

# Configure server security settings
server._api_key = args.api_key
server._default_timeout = args.timeout
Expand Down Expand Up @@ -196,7 +203,8 @@ def serve_command(args):
scheduler_config=scheduler_config,
stream_interval=args.stream_interval if args.continuous_batching else 1,
max_tokens=args.max_tokens,
force_mllm=args.mllm,
force_mllm=getattr(args, "mllm", False),
gpu_memory_utilization=args.gpu_memory_utilization,
served_model_name=args.served_model_name,
mtp=args.enable_mtp,
prefill_step_size=args.prefill_step_size,
Expand Down Expand Up @@ -704,6 +712,14 @@ def main():
action="store_true",
help="Enable continuous batching for multiple concurrent users (slower for single user)",
)
serve_parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.90,
help="Fraction of device memory for Metal allocation limit and emergency "
"cache clear threshold (0.0-1.0, default: 0.90). Increase to 0.95 for "
"large models (200GB+) that need more memory headroom.",
)
# Paged cache options (experimental)
serve_parser.add_argument(
"--use-paged-cache",
Expand Down Expand Up @@ -838,12 +854,14 @@ def main():
"nemotron",
"xlam",
"functionary",
"gemma4",
"glm47",
"minimax",
],
help=(
"Select the tool call parser for the model. Options: "
"auto (auto-detect), mistral, qwen, qwen3_coder, llama, hermes, "
"deepseek, kimi, granite, nemotron, xlam, functionary, glm47. "
"deepseek, gemma4, kimi, granite, nemotron, xlam, functionary, glm47, minimax. "
"Required for --enable-auto-tool-choice."
),
)
Expand Down
Loading
Loading