Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions vllm_mlx/engine/batched.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,15 +177,24 @@ async def start(self) -> None:
from ..scheduler import SchedulerConfig
import os

# Auto-configure Gemma 3 for continuous batching compatibility
# Gemma 3's RotatingKVCache with sliding_window causes garbled output
# in batch mode due to offset tracking corruption. Force full KVCache.
if self._is_mllm and ("gemma-3" in self._model_name.lower() or "gemma3" in self._model_name.lower()):
if os.environ.get("GEMMA3_SLIDING_WINDOW") is None:
os.environ["GEMMA3_SLIDING_WINDOW"] = "0"
# Note on Gemma 3 sliding-window configuration:
# - The default sliding_window=1024 works for multimodal (image + text) input.
# - GEMMA3_SLIDING_WINDOW=0 (full KVCache) enables extended text context,
#   but BREAKS multimodal generation with longer prompts (~1300+ tokens).
#
# Do NOT auto-set GEMMA3_SLIDING_WINDOW=0 for MLLM models.
# Users who need extended text-only context can set it manually:
#   GEMMA3_SLIDING_WINDOW=0 (but avoid multimodal input with long prompts)
if ("gemma-3" in self._model_name.lower() or "gemma3" in self._model_name.lower()):
sliding_window = os.environ.get("GEMMA3_SLIDING_WINDOW")
if sliding_window is not None:
logger.info(
"Auto-set GEMMA3_SLIDING_WINDOW=0 for continuous batching compatibility. "
"This uses full KVCache for all layers (~35GB at 50K tokens)."
f"Gemma 3: Using GEMMA3_SLIDING_WINDOW={sliding_window} "
f"(Note: value 0 may cause issues with multimodal + long prompts)"
)
else:
logger.info(
"Gemma 3: Using default sliding_window=1024 (optimal for multimodal)"
)

# Load model and tokenizer
Expand Down