vllm_mlx/mllm_batch_generator.py (19 additions, 0 deletions)
@@ -757,6 +757,8 @@ def _next(self) -> List[MLLMBatchResponse]:
         # merged into a single BatchKVCache. Merging into an active batch
         # mid-generation would cause shape mismatches in attention layers,
         # so queued requests wait until the current batch finishes.
+        # Exception: text-only requests can be extended into an active batch
+        # via the elif branch below (they skip vision encoding entirely).
         if num_active == 0:
             requests = self.unprocessed_requests[: self.completion_batch_size]
 
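For context on the constraint the comment above describes, here is a minimal sketch (not from this PR) of why an active batch and a freshly prefilled request cannot be stacked into one KV cache mid-generation. It assumes a (batch, heads, seq_len, head_dim) cache layout and uses NumPy as a stand-in for MLX arrays:

```python
# Minimal sketch, assuming a (batch, heads, seq_len, head_dim) cache
# layout; NumPy stands in for MLX arrays here.
import numpy as np

active_keys = np.zeros((4, 8, 57, 64))  # 4 live sequences, 57 cached steps
new_keys = np.zeros((1, 8, 12, 64))     # new request, 12 prompt tokens

try:
    np.concatenate([active_keys, new_keys], axis=0)  # stack on batch axis
except ValueError as err:
    # Fails: all non-batch axes must match, but seq_len is 57 vs. 12.
    print(err)
```

Text-only requests avoid this because they can be prefilled and padded against the current batch state, which is what the new elif branch below exploits.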
@@ -769,6 +771,23 @@ def _next(self) -> List[MLLMBatchResponse]:
             self.active_batch = new_batch
             prompt_processing = True
 
+        # Mid-batch extend: text-only requests can join an active batch
+        # without vision encoding (no shape mismatch risk).
+        elif self.unprocessed_requests:
+            text_only = [
+                r for r in self.unprocessed_requests if not r.images and not r.videos
+            ][: self.completion_batch_size]
+
+            if text_only:
+                new_batch = self._process_prompts(text_only)
+                processed_uids = {r.uid for r in text_only}
+                self.unprocessed_requests = [
+                    r for r in self.unprocessed_requests if r.uid not in processed_uids
+                ]
+                if new_batch is not None:
+                    batch.extend(new_batch)
+                    prompt_processing = True
+
         # Generate next token for active batch
         batch = self.active_batch
         if batch is None:
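To make the new scheduling path concrete, here is a small, self-contained sketch of the partition step in the elif branch. `Req` is a hypothetical stand-in for the real request type; only the fields the branch inspects (`uid`, `images`, `videos`) are modeled:

```python
# Hypothetical stand-in for the real request objects; only the fields
# the elif branch inspects (uid, images, videos) are modeled.
from dataclasses import dataclass, field

@dataclass
class Req:
    uid: int
    images: list = field(default_factory=list)
    videos: list = field(default_factory=list)

queue = [Req(1, images=["img"]), Req(2), Req(3), Req(4, videos=["vid"])]
completion_batch_size = 8

# Same partition as the diff: text-only requests join the active batch,
# vision requests stay queued until the current batch drains.
text_only = [r for r in queue if not r.images and not r.videos]
text_only = text_only[:completion_batch_size]
processed_uids = {r.uid for r in text_only}
queue = [r for r in queue if r.uid not in processed_uids]

print([r.uid for r in text_only])  # [2, 3] -> extended into active batch
print([r.uid for r in queue])      # [1, 4] -> wait for num_active == 0
```

Filtering by uid rather than identity keeps the dequeue correct even if request objects are copied between queues.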