From 0703d4022339605769af631c40b7d3c6a1f21c90 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 17 Dec 2025 11:40:31 -0500 Subject: [PATCH 1/2] Mask out V blocks that are out of sliding window Signed-off-by: Thomas Parnell --- vllm/attention/ops/triton_unified_attention.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index a1877bb4429b..999ff4439824 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -618,6 +618,12 @@ def kernel_unified_attention_3d( L = L * alpha + l_j M = m_j + if SLIDING_WINDOW: + qpos_lo = q_block_local_idx * BLOCK_Q + V = tl.where( + (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0 + ) + # acc : (BLOCK_M, HEAD_SIZE_PADDED) acc += tl.dot(P.to(V.dtype), V) From 7ce3a87d46d68d914dae48761c5720fb5995d184 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 18 Dec 2025 05:53:09 -0500 Subject: [PATCH 2/2] Apply same fix to 2D kernel Signed-off-by: Thomas Parnell --- vllm/attention/ops/triton_unified_attention.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 999ff4439824..85169dc30a89 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -330,6 +330,12 @@ def kernel_unified_attention_2d( L = L * alpha + l_j M = m_j + if SLIDING_WINDOW: + qpos_lo = q_block_local_idx * BLOCK_Q + V = tl.where( + (context_len + qpos_lo - seq_offset[:, None]) < SLIDING_WINDOW, V, 0.0 + ) + # acc : (BLOCK_M, HEAD_SIZE_PADDED) acc += tl.dot(P.to(V.dtype), V)