vllm-project · weijinqian0 · Feb 5, 2026 · Feb 1, 2026 · Feb 5, 2026
@@ -727,7 +727,7 @@ def _forward_fia_slidingwindow(self, query: torch.Tensor, attn_metadata: AscendM
             key = self.key_cache.flatten(2, 3).contiguous()
             value = self.value_cache.flatten(2, 3).contiguous()
 
-        output, _ = torch_npu.npu_fused_infer_attention_score(
+        attn_output, _ = torch_npu.npu_fused_infer_attention_score(
             query,
             key,
             value,
@@ -742,7 +742,8 @@ def _forward_fia_slidingwindow(self, query: torch.Tensor, attn_metadata: AscendM
             actual_seq_lengths_kv=attn_metadata.seq_lens,
         )
 
-        output = output.view(batch_size, self.num_heads, self.head_size)
+        attn_output = attn_output.view(batch_size, self.num_heads, self.head_size)
+        output[:batch_size] = attn_output[:batch_size]
         return output
 
     def forward_fused_infer_attention(