diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a0fe9e0802c..97c02892f13 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -829,7 +829,7 @@ def get_supported_tasks(self) -> "tuple[SupportedTask, ...]": def _make_attention_mask(self, seq_lens, query_lens, position, attn_state) -> torch.Tensor: # Chunk Prefill situation. - if attn_state == AscendAttentionState.ChunkedPrefill: + if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: return self.attn_mask_builder.get_splitfuse_attn_mask( seq_lens, query_lens, position, self.dtype, self.device) # Prefill without cache situation.