Retrieve sliding_window from text config in Gemma3 MM (vllm-project#25085)

hmellor · web-flow · commit f4cd80f94404 · 2025-09-18T06:29:05.000Z
Signed-off-by: Harry Mellor &lt;19981378+hmellor@users.noreply.github.com&gt;
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
@@ -688,7 +688,8 @@ def prepare_attn_masks(
             global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask)
             global_attn_masks.append(global_attn_mask)
 
-            if (sliding_window := self.config.sliding_window) is not None:
+            sliding_window = self.config.text_config.sliding_window
+            if sliding_window is not None:
                 # Create a local causal mask with sliding window (1024).
                 local_attn_mask = torch.ones_like(global_attn_mask)
                 local_attn_mask = torch.tril(local_attn_mask,
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
@@ -461,9 +461,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.multimodal_config = multimodal_config
         self.vocab_size = config.text_config.vocab_size
 
-        self.sliding_window = getattr(config.text_config,
-                                      "interleaved_sliding_window", None)
-
         self.vision_tower = AutoModel.from_config(config=config.vision_config)
         self.audio_tower = AutoModel.from_config(config=config.audio_config)
         self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config,