diff --git a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py index e6e7dd5ccbad..f22474468e1a 100644 --- a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py +++ b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py @@ -840,7 +840,7 @@ def forward_extend( sinks: Optional[torch.Tensor] = None, slopes: Optional[torch.Tensor] = None, ): - if is_mla_preprocess_enabled(): + if is_mla_preprocess_enabled() and self.use_mla: # MLAPO and MLAPROLOG do save kv_cache save_kv_cache = False if self.is_dllm_model: @@ -1748,7 +1748,7 @@ def forward_decode( sinks: Optional[torch.Tensor] = None, slopes: Optional[torch.Tensor] = None, ): - if is_mla_preprocess_enabled(): + if is_mla_preprocess_enabled() and self.use_mla: # MLAPO does saving kv_cache save_kv_cache = False if topk_indices is not None: