pyproject.toml: 2 changes (1 addition, 1 deletion)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-mlx"
-version = "0.2.7"
+version = "0.2.8"
 description = "vLLM-like inference for Apple Silicon - GPU-accelerated Text, Image, Video & Audio on Mac"
 readme = "README.md"
 license = {text = "Apache-2.0"}
vllm_mlx/scheduler.py: 30 changes (27 additions, 3 deletions)
@@ -1256,15 +1256,25 @@ def _prefill_progress(progress_list):
             prefill_batch_size=self.config.prefill_batch_size,
             completion_batch_size=self.config.completion_batch_size,
             prefill_step_size=self.config.prefill_step_size,
-            prompt_progress_callback=_prefill_progress,
         )
+        # Set callback as attribute — used by _install_chunked_prefill
+        # monkey-patch. Not a BatchGenerator constructor parameter.
+        bg.prompt_progress_callback = _prefill_progress
 
         # Install chunked prefill when explicitly configured OR when
         # memory-aware cache is active (needed for prefix_boundary saves
         # in agentic multi-turn workloads with hybrid Mamba+Transformer models).
         chunked_budget = self.config.chunked_prefill_tokens
         need_chunked = chunked_budget > 0 or self.memory_aware_cache is not None
-        if need_chunked:
+
+        # The chunked prefill monkey-patch relies on BatchGenerator internals
+        # (_process_prompts, active_batch, _step, etc.) that were refactored
+        # in mlx-lm 0.31.x. Skip gracefully when the required API is absent.
+        chunked_compatible = hasattr(bg, "_process_prompts") and hasattr(
+            bg, "active_batch"
+        )
+
+        if need_chunked and chunked_compatible:
             if chunked_budget <= 0:
                 # No explicit budget — use a very large value so normal
                 # prompts pass through unchanged. Prefix boundary splits
@@ -1287,6 +1297,12 @@ def _prefill_progress(progress_list):
                 uid_to_request_id=self.uid_to_request_id,
                 requests=self.requests,
             )
+        elif need_chunked and not chunked_compatible:
+            logger.warning(
+                "Chunked prefill disabled: mlx-lm BatchGenerator lacks required "
+                "internals (_process_prompts, active_batch). Upgrade mlx-lm or "
+                "check compatibility."
+            )
 
         # Install MTP if the model supports it
         if self.config.enable_mtp:
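
The gate in the two hunks above is plain feature detection: probe for the private attributes the monkey-patch touches, install it only when they exist, and warn otherwise. A minimal standalone sketch of that pattern, assuming a hypothetical `patch_fn` standing in for the real replacement body (which lives in `_install_chunked_prefill`, outside this diff):

```python
import logging
import types

logger = logging.getLogger(__name__)

def install_if_compatible(bg, patch_fn) -> bool:
    """Monkey-patch bg._process_prompts only if the private mlx-lm
    internals the patch depends on exist on this BatchGenerator."""
    required = ("_process_prompts", "active_batch")
    missing = [name for name in required if not hasattr(bg, name)]
    if missing:
        logger.warning("Chunked prefill disabled: BatchGenerator lacks %s", missing)
        return False
    bg._process_prompts = types.MethodType(patch_fn, bg)  # bound override
    return True
```

Probing with `hasattr` rather than pinning an mlx-lm version keeps the base path working after upstream refactors; only the optional optimization degrades.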
@@ -2334,9 +2350,16 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
 
         # Run generation step if we have running requests
         if self.batch_generator is not None and self.running:
-            responses = self.batch_generator.next()
+            result = self.batch_generator.next()
             output.has_work = True
 
+            # mlx-lm >=0.31.x returns (prompt_responses, generation_responses);
+            # older versions returned a flat list.
+            if isinstance(result, tuple):
+                responses = result[1]  # generation_responses only
+            else:
+                responses = result
+
             if responses:
                 outputs, finished_ids = self._process_batch_responses(responses)
                 output.outputs = outputs
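
The `isinstance` check is the entire version shim for `next()`. If the same normalization were ever needed at more than one call site, it could be factored into a helper; a sketch under the same assumption of exactly two return shapes (the helper name is illustrative, not existing scheduler API):

```python
def generation_responses(result):
    """Normalize BatchGenerator.next() output across mlx-lm versions.

    mlx-lm >= 0.31.x returns (prompt_responses, generation_responses);
    older releases return a flat list of generation responses.
    """
    if isinstance(result, tuple):
        _prompt_responses, gen_responses = result
        return gen_responses
    return result

# Usage at the call site:
# responses = generation_responses(self.batch_generator.next())
```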
@@ -2403,6 +2426,7 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
         # Evaluate batch tokens to collapse lazy concatenation chains
         if (
             self.batch_generator is not None
+            and hasattr(self.batch_generator, "active_batch")
             and self.batch_generator.active_batch is not None
             and hasattr(self.batch_generator.active_batch, "tokens")
         ):
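
The new `hasattr` guard protects this block on mlx-lm builds whose `BatchGenerator` no longer exposes `active_batch`. The evaluation itself matters because MLX arrays are lazy: a token buffer grown by repeated concatenation keeps an ever-deeper pending compute graph until something forces it. A toy illustration of that effect, independent of the scheduler:

```python
import mlx.core as mx

tokens = mx.array([[0]], dtype=mx.int32)
for step in range(1, 256):
    new_tok = mx.array([[step]], dtype=mx.int32)
    # Lazy: each concatenate extends the pending graph rather than copying now.
    tokens = mx.concatenate([tokens, new_tok], axis=1)

mx.eval(tokens)  # materialize once, collapsing the concatenation chain
```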