vllm-project · Isotr0py · Dec 19, 2025 · Dec 17, 2025 · Dec 18, 2025 · Dec 18, 2025
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
@@ -357,6 +357,12 @@ def kernel_unified_attention_2d(
         L = L * alpha + l_j
         M = m_j
 
+        if SLIDING_WINDOW:
+            qpos_lo = q_block_local_idx * BLOCK_Q
+            V = tl.where(
+                (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0
+            )
+
         # acc : (BLOCK_M, HEAD_SIZE_PADDED)
         acc += tl.dot(P.to(V.dtype), V)
 
@@ -672,6 +678,12 @@ def kernel_unified_attention_3d(
         L = L * alpha + l_j
         M = m_j
 
+        if SLIDING_WINDOW:
+            qpos_lo = q_block_local_idx * BLOCK_Q
+            V = tl.where(
+                (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0
+            )
+
         # acc : (BLOCK_M, HEAD_SIZE_PADDED)
         acc += tl.dot(P.to(V.dtype), V)