diff --git a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py index 2fcdbf2a8d1c..e2928ff95a1b 100644 --- a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py +++ b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py @@ -1041,8 +1041,8 @@ def forward_extend( num_heads=layer.tp_q_head_num, num_key_value_heads=layer.tp_k_head_num, input_layout="BSND", # todo, TND not supports q_heads!=k_heads - atten_mask=self.fia_mask.unsqueeze(0), - sparse_mode=3 if q_len != 1 else 0, + atten_mask=self.fia_mask, + sparse_mode=3, scale=layer.scaling, next_tokens=0, )[0]