
Commit f1f41ad

zixi-qi authored and houseroad committed
[bug fix] Fix llama4 spec decoding (vllm-project#22691)
Signed-off-by: qizixi <[email protected]>
Co-authored-by: Lu Fang <[email protected]>
1 parent 9360372 commit f1f41ad

1 file changed: +4, -2 lines

vllm/model_executor/models/llama4.py

Lines changed: 4 additions & 2 deletions
@@ -195,7 +195,9 @@ def __init__(self,
             is_neox_style=is_neox_style,
         ) if not self.nope else None
 
-        attn_cls = Attention if self.nope else ChunkedLocalAttention
+        use_chunked_local_attn = not self.nope and config.attention_chunk_size
+        attn_cls = (ChunkedLocalAttention
+                    if use_chunked_local_attn else Attention)
         self.attn = attn_cls(
             self.num_heads,
             self.head_dim,
@@ -206,7 +208,7 @@ def __init__(self,
             prefix=f"{prefix}.attn",
             **({
                 "attention_chunk_size": config.attention_chunk_size
-            } if not self.nope else {}))
+            } if use_chunked_local_attn else {}))
 
     def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
         floor = torch.floor((positions + 1.0) / self.floor_scale)
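In short, the old code selected ChunkedLocalAttention for every RoPE layer (not self.nope), even when config.attention_chunk_size was not set, which appears to be the configuration hit during Llama 4 speculative decoding per the commit title; after this change, chunked local attention is only used when a chunk size is actually configured. Below is a minimal, self-contained sketch of that selection logic; the stub classes and the select_attn_cls helper are illustrative stand-ins, not vLLM's real Attention/ChunkedLocalAttention APIs.

# Minimal sketch of the selection logic after this commit.
# Attention / ChunkedLocalAttention are illustrative stubs, not vLLM's
# real classes; attention_chunk_size mirrors config.attention_chunk_size,
# which may be None for some configs (e.g. a speculative-decoding draft).
from typing import Optional


class Attention:
    """Stub standing in for the global-attention implementation."""


class ChunkedLocalAttention:
    """Stub standing in for the chunked local-attention implementation."""


def select_attn_cls(nope: bool, attention_chunk_size: Optional[int]) -> type:
    # Old behaviour was `Attention if nope else ChunkedLocalAttention`,
    # which picked chunked attention even when no chunk size was set.
    use_chunked_local_attn = not nope and bool(attention_chunk_size)
    return ChunkedLocalAttention if use_chunked_local_attn else Attention


# RoPE layer with a chunk size configured -> chunked local attention.
assert select_attn_cls(nope=False, attention_chunk_size=8192) is ChunkedLocalAttention
# RoPE layer but no chunk size (the case this commit fixes) -> plain attention.
assert select_attn_cls(nope=False, attention_chunk_size=None) is Attention
# NoPE layer -> always plain attention.
assert select_attn_cls(nope=True, attention_chunk_size=8192) is Attention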
