pyproject.toml: 2 changes (1 addition, 1 deletion)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-mlx"
-version = "0.2.7"
+version = "0.2.8"
 description = "vLLM-like inference for Apple Silicon - GPU-accelerated Text, Image, Video & Audio on Mac"
 readme = "README.md"
 license = {text = "Apache-2.0"}
vllm_mlx/scheduler.py: 30 changes (27 additions, 3 deletions)
@@ -1256,15 +1256,25 @@ def _prefill_progress(progress_list):
             prefill_batch_size=self.config.prefill_batch_size,
             completion_batch_size=self.config.completion_batch_size,
             prefill_step_size=self.config.prefill_step_size,
-            prompt_progress_callback=_prefill_progress,
         )
+        # Set callback as attribute — used by _install_chunked_prefill
+        # monkey-patch. Not a BatchGenerator constructor parameter.
+        bg.prompt_progress_callback = _prefill_progress
 
         # Install chunked prefill when explicitly configured OR when
         # memory-aware cache is active (needed for prefix_boundary saves
         # in agentic multi-turn workloads with hybrid Mamba+Transformer models).
         chunked_budget = self.config.chunked_prefill_tokens
         need_chunked = chunked_budget > 0 or self.memory_aware_cache is not None
-        if need_chunked:
+
+        # The chunked prefill monkey-patch relies on BatchGenerator internals
+        # (_process_prompts, active_batch, _step, etc.) that were refactored
+        # in mlx-lm 0.31.x. Skip gracefully when the required API is absent.
+        chunked_compatible = hasattr(bg, "_process_prompts") and hasattr(
+            bg, "active_batch"
+        )
+
+        if need_chunked and chunked_compatible:
             if chunked_budget <= 0:
                 # No explicit budget — use a very large value so normal
                 # prompts pass through unchanged. Prefix boundary splits
@@ -1287,6 +1297,12 @@ def _prefill_progress(progress_list):
                 uid_to_request_id=self.uid_to_request_id,
                 requests=self.requests,
             )
+        elif need_chunked and not chunked_compatible:
+            logger.warning(
+                "Chunked prefill disabled: mlx-lm BatchGenerator lacks required "
+                "internals (_process_prompts, active_batch). Upgrade mlx-lm or "
+                "check compatibility."
+            )
 
         # Install MTP if the model supports it
         if self.config.enable_mtp:
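
The gate in the two hunks above is plain feature detection: probe for the private attributes the monkey-patch touches, install it only when they exist, and warn otherwise. A minimal standalone sketch of that pattern, assuming a hypothetical `patch_fn` standing in for the real replacement body (which lives in `_install_chunked_prefill`, outside this diff):

```python
import logging
import types

logger = logging.getLogger(__name__)

def install_if_compatible(bg, patch_fn) -> bool:
    """Monkey-patch bg._process_prompts only if the private mlx-lm
    internals the patch depends on exist on this BatchGenerator."""
    required = ("_process_prompts", "active_batch")
    missing = [name for name in required if not hasattr(bg, name)]
    if missing:
        logger.warning("Chunked prefill disabled: BatchGenerator lacks %s", missing)
        return False
    bg._process_prompts = types.MethodType(patch_fn, bg)  # bound override
    return True
```

Probing with `hasattr` rather than pinning an mlx-lm version keeps the base path working after upstream refactors; only the optional optimization degrades.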
@@ -2334,9 +2350,16 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
 
         # Run generation step if we have running requests
         if self.batch_generator is not None and self.running:
-            responses = self.batch_generator.next()
+            result = self.batch_generator.next()
             output.has_work = True
 
+            # mlx-lm >=0.31.x returns (prompt_responses, generation_responses);
+            # older versions returned a flat list.
+            if isinstance(result, tuple):
+                responses = result[1]  # generation_responses only
+            else:
+                responses = result
+
             if responses:
                 outputs, finished_ids = self._process_batch_responses(responses)
                 output.outputs = outputs
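
The `isinstance` check is the entire version shim for `next()`. If the same normalization were ever needed at more than one call site, it could be factored into a helper; a sketch under the same assumption of exactly two return shapes (the helper name is illustrative, not existing scheduler API):

```python
def generation_responses(result):
    """Normalize BatchGenerator.next() output across mlx-lm versions.

    mlx-lm >= 0.31.x returns (prompt_responses, generation_responses);
    older releases return a flat list of generation responses.
    """
    if isinstance(result, tuple):
        _prompt_responses, gen_responses = result
        return gen_responses
    return result

# Usage at the call site:
# responses = generation_responses(self.batch_generator.next())
```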
@@ -2403,6 +2426,7 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
         # Evaluate batch tokens to collapse lazy concatenation chains
         if (
             self.batch_generator is not None
+            and hasattr(self.batch_generator, "active_batch")
             and self.batch_generator.active_batch is not None
             and hasattr(self.batch_generator.active_batch, "tokens")
         ):
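
The new `hasattr` guard protects this block on mlx-lm builds whose `BatchGenerator` no longer exposes `active_batch`. The evaluation itself matters because MLX arrays are lazy: a token buffer grown by repeated concatenation keeps an ever-deeper pending compute graph until something forces it. A toy illustration of that effect, independent of the scheduler:

```python
import mlx.core as mx

tokens = mx.array([[0]], dtype=mx.int32)
for step in range(1, 256):
    new_tok = mx.array([[step]], dtype=mx.int32)
    # Lazy: each concatenate extends the pending graph rather than copying now.
    tokens = mx.concatenate([tokens, new_tok], axis=1)

mx.eval(tokens)  # materialize once, collapsing the concatenation chain
```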