From 5c2c72a9315cc14ce39fbea05e0dce2e4dd5dde1 Mon Sep 17 00:00:00 2001 From: Estrella-xx <820167017@qq.com> Date: Tue, 24 Mar 2026 20:52:09 +0800 Subject: [PATCH] Fix the attention-mask bug that occurs when q_len == 1 with chunk prefill enabled --- .../srt/hardware_backend/npu/attention/ascend_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py index 2fcdbf2a8d1c..e2928ff95a1b 100644 --- a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py +++ b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py @@ -1041,8 +1041,8 @@ def forward_extend( num_heads=layer.tp_q_head_num, num_key_value_heads=layer.tp_k_head_num, input_layout="BSND", # todo, TND not supports q_heads!=k_heads - atten_mask=self.fia_mask.unsqueeze(0), - sparse_mode=3 if q_len != 1 else 0, + atten_mask=self.fia_mask, + sparse_mode=3, scale=layer.scaling, next_tokens=0, )[0]