From 5c2c72a9315cc14ce39fbea05e0dce2e4dd5dde1 Mon Sep 17 00:00:00 2001 From: Estrella-xx <820167017@qq.com> Date: Tue, 24 Mar 2026 20:52:09 +0800 Subject: [PATCH] Fix the attention-mask bug that occurs when q_len == 1 with chunk prefill enabled --- .../srt/hardware_backend/npu/attention/ascend_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py index 2fcdbf2a8d1c..e2928ff95a1b 100644 --- a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py +++ b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py @@ -1041,8 +1041,8 @@ def forward_extend( num_heads=layer.tp_q_head_num, num_key_value_heads=layer.tp_k_head_num, input_layout="BSND", # todo, TND not supports q_heads!=k_heads - atten_mask=self.fia_mask.unsqueeze(0), - sparse_mode=3 if q_len != 1 else 0, + atten_mask=self.fia_mask, + sparse_mode=3, scale=layer.scaling, next_tokens=0, )[0]