Closed
Changes from all commits (32 commits)
6d6f085
feat: Add Anthropic Messages API endpoint with tool call parser fallback
janhilgard Feb 6, 2026
a158cef
Fix VRAM spike at end of generation via lazy-eval-safe cache handling
janhilgard Feb 6, 2026
af46e0d
Add mid-prefill cache saving, disconnect detection, and cancellation …
janhilgard Feb 6, 2026
3b6f756
Fix lint: rename ambiguous variable `l`, remove unused import
janhilgard Feb 6, 2026
dd00521
Fix mid-prefill cache restore: convert BatchKVCache to KVCache
janhilgard Feb 6, 2026
44b1cea
Fix cache fetch: prefer supersequence match over prefix match
janhilgard Feb 6, 2026
ae687bb
Add prefix cache sharing for requests with common prefix
janhilgard Feb 6, 2026
49ef8e6
Fix Metal SIGABRT crash when aborting chunked prefill mid-stream
janhilgard Feb 6, 2026
21e282e
Abort chunked prefill immediately on client disconnect
janhilgard Feb 7, 2026
aa97fe4
Fix streaming tool call parsing: emit structured delta.tool_calls
janhilgard Feb 7, 2026
3452b44
Fix nested JSON serialization in Nemotron XML tool parser
janhilgard Feb 7, 2026
0ce82e9
Fix prompt cache for hybrid Mamba+Transformer models
janhilgard Feb 7, 2026
06b7af4
Add prefix-subset eviction to reduce cache memory ~6x
janhilgard Feb 7, 2026
e9791ef
Fix multi-turn tool calling: enable native tool format for Hermes parser
janhilgard Feb 7, 2026
c714334
Fix <|im_end|> leaking into streaming OpenAI responses
janhilgard Feb 7, 2026
e7cf750
Fix stop token leaking as text: skip decoding EOS tokens in streaming
janhilgard Feb 7, 2026
65c5830
Fix prompt cache for chunked prefill (enables Anthropic endpoint cach…
janhilgard Feb 7, 2026
43ce701
Stop clearing prefix cache on sampler param changes
janhilgard Feb 7, 2026
b921a5e
Fix crash loop on cache shape mismatch after BatchGenerator recreation
janhilgard Feb 7, 2026
0ec2faf
Fix prefix cache for agentic multi-turn on hybrid Mamba+Transformer m…
janhilgard Feb 7, 2026
3f8b006
Optimize inference: bisect cache lookup, remove deepcopy, reduce clea…
janhilgard Feb 8, 2026
52977e9
Add GET /v1/status endpoint with real-time per-request monitoring
janhilgard Feb 8, 2026
ccba569
Merge origin/main into feature/anthropic-endpoint
janhilgard Feb 8, 2026
25557e2
fix: update native tool format test for HermesToolParser
janhilgard Feb 8, 2026
34e2e93
fix: add missing --default-temperature and --default-top-p CLI args
janhilgard Feb 8, 2026
a7ecc45
Pass request context to tool parsers for tool name validation
waybarrios Feb 8, 2026
2902742
Add Anthropic endpoint docs, tests, and CI integration
waybarrios Feb 8, 2026
2f06a7a
Fix black formatting in test_anthropic_adapter
waybarrios Feb 8, 2026
50ca1f0
Expand Anthropic endpoint and status docs with full examples
waybarrios Feb 8, 2026
d9c97fa
fix: route text-only requests through MLLM scheduler for vision models
janhilgard Feb 9, 2026
4a62193
Merge origin/main into fix/mllm-text-only-crash
janhilgard Feb 9, 2026
7bf1881
Merge remote-tracking branch 'origin/main' into fix/mllm-text-only-crash
janhilgard Feb 10, 2026
14 changes: 8 additions & 6 deletions vllm_mlx/engine/batched.py
@@ -419,8 +419,9 @@ async def generate(
         if not self._loaded:
             await self.start()

-        if self._is_mllm and self._mllm_scheduler and (images or videos):
-            # Use MLLM scheduler for multimodal
+        if self._is_mllm and self._mllm_scheduler:
+            # Use MLLM scheduler for all requests on vision models
+            # (both multimodal and text-only, since LLM engine is not loaded for MLLM)
             output = await self._mllm_scheduler.generate(
                 prompt=prompt,
                 images=images,
@@ -437,7 +438,7 @@
                 finish_reason=output.finish_reason,
             )

-        # Use LLM engine for text-only
+        # Use LLM engine for text-only (non-MLLM models)
         from ..request import SamplingParams

         sampling_params = SamplingParams(
@@ -491,8 +492,9 @@ async def stream_generate(
         if not self._loaded:
             await self.start()

-        if self._is_mllm and self._mllm_scheduler and (images or videos):
-            # Use MLLM scheduler for multimodal streaming
+        if self._is_mllm and self._mllm_scheduler:
+            # Use MLLM scheduler for all requests on vision models
+            # (both multimodal and text-only, since LLM engine is not loaded for MLLM)
             request_id = await self._mllm_scheduler.add_request_async(
                 prompt=prompt,
                 images=images,
@@ -513,7 +515,7 @@
             )
             return

-        # Use LLM engine for text-only
+        # Use LLM engine for text-only (non-MLLM models)
         from ..request import SamplingParams

         sampling_params = SamplingParams(
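The routing change in both hunks follows the same rule: on a vision (MLLM) model the text-only LLM engine is never loaded, so every request, with or without media, must be dispatched to the MLLM scheduler. A minimal sketch of that dispatch rule (class and method names here are hypothetical, not taken from the PR):

```python
# Hypothetical sketch of the routing fix: vision models always use the
# MLLM scheduler, because no separate text-only LLM engine is loaded.

class EngineRouter:
    def __init__(self, is_mllm, mllm_scheduler=None):
        self._is_mllm = is_mllm
        self._mllm_scheduler = mllm_scheduler

    def pick_backend(self, images=None, videos=None):
        # Old behavior: `if self._is_mllm and scheduler and (images or videos)`
        # sent text-only requests on vision models to the missing LLM engine.
        # New behavior: drop the media check for MLLM models.
        if self._is_mllm and self._mllm_scheduler is not None:
            return "mllm"
        return "llm"


router = EngineRouter(is_mllm=True, mllm_scheduler=object())
print(router.pick_backend())                     # text-only -> "mllm"
print(router.pick_backend(images=["a.png"]))     # multimodal -> "mllm"
print(EngineRouter(is_mllm=False).pick_backend())  # text model -> "llm"
```

With the old condition, the first call would have returned "llm" and crashed, since vision models never instantiate that engine.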