diff --git a/pyproject.toml b/pyproject.toml
index ed5b2deff..1191954c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-mlx"
-version = "0.2.7"
+version = "0.2.8"
 description = "vLLM-like inference for Apple Silicon - GPU-accelerated Text, Image, Video & Audio on Mac"
 readme = "README.md"
 license = {text = "Apache-2.0"}
diff --git a/vllm_mlx/scheduler.py b/vllm_mlx/scheduler.py
index 32057c19a..520d19d53 100644
--- a/vllm_mlx/scheduler.py
+++ b/vllm_mlx/scheduler.py
@@ -1256,15 +1256,25 @@ def _prefill_progress(progress_list):
             prefill_batch_size=self.config.prefill_batch_size,
             completion_batch_size=self.config.completion_batch_size,
             prefill_step_size=self.config.prefill_step_size,
-            prompt_progress_callback=_prefill_progress,
         )
+        # Set callback as attribute — used by _install_chunked_prefill
+        # monkey-patch. Not a BatchGenerator constructor parameter.
+        bg.prompt_progress_callback = _prefill_progress
 
         # Install chunked prefill when explicitly configured OR when
         # memory-aware cache is active (needed for prefix_boundary saves
         # in agentic multi-turn workloads with hybrid Mamba+Transformer models).
         chunked_budget = self.config.chunked_prefill_tokens
         need_chunked = chunked_budget > 0 or self.memory_aware_cache is not None
-        if need_chunked:
+
+        # The chunked prefill monkey-patch relies on BatchGenerator internals
+        # (_process_prompts, active_batch, _step, etc.) that were refactored
+        # in mlx-lm 0.31.x. Skip gracefully when the required API is absent.
+        chunked_compatible = hasattr(bg, "_process_prompts") and hasattr(
+            bg, "active_batch"
+        )
+
+        if need_chunked and chunked_compatible:
             if chunked_budget <= 0:
                 # No explicit budget — use a very large value so normal
                 # prompts pass through unchanged. Prefix boundary splits
@@ -1287,6 +1297,12 @@ def _prefill_progress(progress_list):
                 uid_to_request_id=self.uid_to_request_id,
                 requests=self.requests,
             )
+        elif need_chunked and not chunked_compatible:
+            logger.warning(
+                "Chunked prefill disabled: mlx-lm BatchGenerator lacks required "
+                "internals (_process_prompts, active_batch). Upgrade mlx-lm or "
+                "check compatibility."
+            )
 
         # Install MTP if the model supports it
         if self.config.enable_mtp:
@@ -2334,9 +2350,16 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
 
         # Run generation step if we have running requests
         if self.batch_generator is not None and self.running:
-            responses = self.batch_generator.next()
+            result = self.batch_generator.next()
             output.has_work = True
 
+            # mlx-lm >=0.31.x returns (prompt_responses, generation_responses);
+            # older versions returned a flat list.
+            if isinstance(result, tuple):
+                responses = result[1]  # generation_responses only
+            else:
+                responses = result
+
             if responses:
                 outputs, finished_ids = self._process_batch_responses(responses)
                 output.outputs = outputs
@@ -2403,6 +2426,7 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
         # Evaluate batch tokens to collapse lazy concatenation chains
         if (
             self.batch_generator is not None
+            and hasattr(self.batch_generator, "active_batch")
             and self.batch_generator.active_batch is not None
             and hasattr(self.batch_generator.active_batch, "tokens")
         ):
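
Reviewer note (not part of the patch): the sketch below isolates the two version-compatibility checks this change introduces, so their behavior can be sanity-checked without mlx-lm installed. The helper names (normalize_next_result, supports_chunked_prefill) and the Old/NewStyleStub classes are hypothetical stand-ins; they model only the behaviors the scheduler depends on — the shape of next()'s return value and the presence of the monkey-patched internals — and are not mlx-lm's actual API surface.

def normalize_next_result(result):
    """Flatten BatchGenerator.next() output across mlx-lm versions.

    mlx-lm >=0.31.x returns (prompt_responses, generation_responses);
    older releases return a flat list of responses.
    """
    if isinstance(result, tuple):
        return result[1]  # generation_responses only
    return result


def supports_chunked_prefill(bg) -> bool:
    """True when the internals the chunked-prefill monkey-patch needs exist."""
    return hasattr(bg, "_process_prompts") and hasattr(bg, "active_batch")


class OldStyleStub:
    """Hypothetical pre-0.31 shape: flat list from next(), patchable internals."""

    active_batch = None

    def _process_prompts(self):
        pass

    def next(self):
        return ["gen_response"]


class NewStyleStub:
    """Hypothetical >=0.31 shape: tuple from next(), refactored internals."""

    def next(self):
        return (["prompt_response"], ["gen_response"])


for stub in (OldStyleStub(), NewStyleStub()):
    print(normalize_next_result(stub.next()), supports_chunked_prefill(stub))
# -> ['gen_response'] True    (old shape: flat list, patch can install)
# -> ['gen_response'] False   (new shape: tuple, patch is skipped with a warning)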