# Commentary preamble: this text sits before the first "diff --git" header and is not part of any hunk.
# The patch below is stored with its line breaks and indentation flattened to single spaces.
#
# What the patch changes, per file:
# - tests/e2e/310p/multicard/test_dense_model_multicard.py,
#   tests/e2e/310p/multicard/test_moe_model_multicard.py,
#   tests/e2e/310p/singlecard/test_dense_model_singlecard.py:
#   every VllmRunner(...) call is re-emitted with a trailing comma on its last
#   original argument and a new max_model_len=16384 argument. The other removed
#   and added argument lines carry identical text here, so presumably only their
#   indentation changed -- TODO confirm against the unflattened patch.
# - tests/ut/_310p/attention/test_attention_mask_310.py:
#   setUp stores self.max_seqlen = 4096 and passes it as the second argument of
#   AttentionMaskBuilder310; the expected shapes change from the literals
#   (1, 128, 2048, 16) and (1, 128, 16, 16) to expressions built from
#   self.max_seqlen // 16 and self.max_seqlen.
# - vllm_ascend/_310p/attention/attention_mask.py:
#   the class attribute max_seqlen changes from 2048 to 16384; __init__ gains a
#   max_seqlen parameter and assigns it to AttentionMaskBuilder310.max_seqlen;
#   self._seq_len_cached is removed; _get_causal_mask now builds the mask only
#   while self.attn_mask_cache is None.
# - vllm_ascend/_310p/attention/attention_v1.py:
#   the AttentionMaskBuilder310(self.device) assignment is removed from __init__.
# - vllm_ascend/_310p/attention/metadata_builder.py:
#   vllm_config.model_config.max_model_len is read and passed to
#   AttentionMaskBuilder310 as its second argument.
# - vllm_ascend/_310p/fused_moe/fused_moe.py:
#   vllm_version_is is imported from vllm_ascend.utils; when
#   vllm_version_is("0.15.0") is false, __init__ sets
#   self.runner = self._init_runner(), and an _init_runner method that returns
#   AscendMoERunner(...) is defined under the same version check.
#
# NOTE(review): __init__ writes max_seqlen to the class (AttentionMaskBuilder310.max_seqlen),
#   not to the instance, so building a second builder with a different length changes the
#   value seen through the class by every instance -- confirm this is intended.
# NOTE(review): with _seq_len_cached gone, _get_causal_mask returns the first mask it built
#   even if later called with a larger max_seq_len; presumably callers always pass the same
#   length -- verify against the call sites, which are not shown here.
# NOTE(review): the attention_v1.py __init__ docstring kept as context still says it
#   "sets up the device-specific mask builder", which this patch removes -- looks stale.
# NOTE(review): max_seqlen is added to __init__ without a default, so any caller not
#   updated by this patch would fail at construction -- TODO confirm none remain.
diff --git a/tests/e2e/310p/multicard/test_dense_model_multicard.py b/tests/e2e/310p/multicard/test_dense_model_multicard.py index e964c48fc76..2d8d008f9c6 100644 --- a/tests/e2e/310p/multicard/test_dense_model_multicard.py +++ b/tests/e2e/310p/multicard/test_dense_model_multicard.py @@ -24,23 +24,26 @@ def test_qwen3_dense_tp2_fp16(): ] max_tokens = 5 with VllmRunner( - "Qwen/Qwen3-8B", - tensor_parallel_size=2, - enforce_eager=True, - dtype="float16" + "Qwen/Qwen3-8B", + tensor_parallel_size=2, + enforce_eager=True, + dtype="float16", + max_model_len=16384, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) + def test_qwen3_dense_tp4_w8a8(): example_prompts = [ "Hello, my name is", ] max_tokens = 5 with VllmRunner( - "vllm-ascend/Qwen3-32B-W8A8", - tensor_parallel_size=4, - enforce_eager=True, - dtype="float16", - quantization="ascend" + "vllm-ascend/Qwen3-32B-W8A8", + tensor_parallel_size=4, + enforce_eager=True, + dtype="float16", + quantization="ascend", + max_model_len=16384, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/310p/multicard/test_moe_model_multicard.py b/tests/e2e/310p/multicard/test_moe_model_multicard.py index a2cb9940960..f761acee58c 100644 --- a/tests/e2e/310p/multicard/test_moe_model_multicard.py +++ b/tests/e2e/310p/multicard/test_moe_model_multicard.py @@ -24,37 +24,42 @@ def test_qwen3_moe_tp4_fp16(): ] max_tokens = 5 with VllmRunner( - "Qwen/Qwen3-30B-A3B", - tensor_parallel_size=4, - enforce_eager=True, - dtype="float16" + "Qwen/Qwen3-30B-A3B", + tensor_parallel_size=4, + enforce_eager=True, + dtype="float16", + max_model_len=16384, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) + def test_qwen3_moe_ep4_fp16(): example_prompts = [ "Hello, my name is", ] max_tokens = 5 with VllmRunner( - "Qwen/Qwen3-30B-A3B", - tensor_parallel_size=4, - enforce_eager=True, - dtype="float16", - enable_expert_parallel=True + "Qwen/Qwen3-30B-A3B", + 
tensor_parallel_size=4, + enforce_eager=True, + dtype="float16", + enable_expert_parallel=True, + max_model_len=16384, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) + def test_qwen3_moe_tp2_w8a8(): example_prompts = [ "Hello, my name is", ] max_tokens = 5 with VllmRunner( - "vllm-ascend/Qwen3-30B-A3B-W8A8", - tensor_parallel_size=2, - enforce_eager=True, - dtype="float16", - quantization="ascend" + "vllm-ascend/Qwen3-30B-A3B-W8A8", + tensor_parallel_size=2, + enforce_eager=True, + dtype="float16", + quantization="ascend", + max_model_len=16384, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/310p/singlecard/test_dense_model_singlecard.py b/tests/e2e/310p/singlecard/test_dense_model_singlecard.py index a557f577173..26cc05a4ea2 100644 --- a/tests/e2e/310p/singlecard/test_dense_model_singlecard.py +++ b/tests/e2e/310p/singlecard/test_dense_model_singlecard.py @@ -24,23 +24,26 @@ def test_qwen3_dense_tp1_fp16(): ] max_tokens = 5 with VllmRunner( - "Qwen/Qwen3-8B", - tensor_parallel_size=1, - enforce_eager=True, - dtype="float16" + "Qwen/Qwen3-8B", + tensor_parallel_size=1, + enforce_eager=True, + dtype="float16", + max_model_len=16384, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) + def test_qwen3_dense_tp1_w8a8(): example_prompts = [ "Hello, my name is", ] max_tokens = 5 with VllmRunner( - "vllm-ascend/Qwen3-8B-W8A8", - tensor_parallel_size=1, - enforce_eager=True, - dtype="float16", - quantization="ascend" + "vllm-ascend/Qwen3-8B-W8A8", + tensor_parallel_size=1, + enforce_eager=True, + dtype="float16", + quantization="ascend", + max_model_len=16384, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/ut/_310p/attention/test_attention_mask_310.py b/tests/ut/_310p/attention/test_attention_mask_310.py index e3b1f284c75..c56b0a2fd3e 100644 --- a/tests/ut/_310p/attention/test_attention_mask_310.py +++ 
b/tests/ut/_310p/attention/test_attention_mask_310.py @@ -23,7 +23,8 @@ class TestAttentionMaskBuilder310(TestBase): def setUp(self): - self.attention_mask_builder = AttentionMaskBuilder310(torch.device("cpu")) + self.max_seqlen = 4096 + self.attention_mask_builder = AttentionMaskBuilder310(torch.device("cpu"), self.max_seqlen) def test_get_attention_mask_310_for_pooling_model(self): model_config = MagicMock() @@ -36,7 +37,7 @@ def test_get_attention_mask_310(self, mock_format_cast): mock_format_cast.side_effect = lambda x, y: x model_config = MagicMock() attn_mask = self.attention_mask_builder.get_attention_mask(model_config) - self.assertEqual(attn_mask.shape, (1, 128, 2048, 16)) + self.assertEqual(attn_mask.shape, (1, self.max_seqlen // 16, self.max_seqlen, 16)) self.assertEqual(attn_mask[0][-1][0][-1], torch.tensor(float("-inf"), dtype=torch.float16)) @patch("torch_npu.npu_format_cast") @@ -47,7 +48,7 @@ def test_get_swa_mask_310(self, mock_format_cast): sliding_window = 128 swa_mask = self.attention_mask_builder.get_swa_mask(torch.float16, sliding_window) - self.assertEqual(swa_mask.shape, (1, 128, 2048, 16)) + self.assertEqual(swa_mask.shape, (1, self.max_seqlen // 16, self.max_seqlen, 16)) self.assertEqual(swa_mask[0][-1][0][-1], torch.tensor(float("-inf"), dtype=torch.float16)) self.assertEqual(swa_mask[0][0][-1][0], torch.tensor(float("-inf"), dtype=torch.float16)) @@ -58,4 +59,4 @@ def test_get_splitfuse_attn_mask_310(self, mock_format_cast): attn_metadata.query_start_loc = torch.tensor([0, 1, 5]) attn_metadata.seq_lens = torch.tensor([7, 4]) attn_mask = self.attention_mask_builder.get_splitfuse_mask(attn_metadata, torch.device("cpu")) - self.assertEqual(attn_mask.shape, (1, 128, 16, 16)) + self.assertEqual(attn_mask.shape, (1, self.max_seqlen // 16, 16, 16)) diff --git a/vllm_ascend/_310p/attention/attention_mask.py b/vllm_ascend/_310p/attention/attention_mask.py index 7fec30efce0..ed67d126059 100644 --- a/vllm_ascend/_310p/attention/attention_mask.py 
+++ b/vllm_ascend/_310p/attention/attention_mask.py @@ -24,19 +24,20 @@ class AttentionMaskBuilder310: chunked_prefill_attn_mask = None - max_seqlen = 2048 + max_seqlen = 16384 - def __init__(self, device: torch.device): + def __init__(self, device: torch.device, max_seqlen: int): """ Initializes the AttentionMaskBuilder for the 310P device. Args: device (torch.device): The device on which tensors will be allocated. + max_seqlen (int): Maximum length of a sequence (including prompt and generated text). """ + AttentionMaskBuilder310.max_seqlen = max_seqlen self.attn_mask_cache = None self.device = device self.swa_mask = None - self._seq_len_cached = 0 @staticmethod def gen_causal_additive_mask(max_seq_len: int, device: torch.device): @@ -147,8 +148,7 @@ def _get_causal_mask(self, max_seq_len: int) -> torch.Tensor: Returns: torch.Tensor: The cached causal mask in ACL_FORMAT_FRACTAL_NZ. """ - if self.attn_mask_cache is None or max_seq_len > self._seq_len_cached: + if self.attn_mask_cache is None: attn_mask = self.gen_causal_additive_mask(max_seq_len, self.device) self.attn_mask_cache = torch_npu.npu_format_cast(nd_to_nz_2d(attn_mask), ACL_FORMAT_FRACTAL_NZ) - self._seq_len_cached = max_seq_len return self.attn_mask_cache diff --git a/vllm_ascend/_310p/attention/attention_v1.py b/vllm_ascend/_310p/attention/attention_v1.py index c080d842e9f..734d594e0ea 100644 --- a/vllm_ascend/_310p/attention/attention_v1.py +++ b/vllm_ascend/_310p/attention/attention_v1.py @@ -41,7 +41,6 @@ def __init__(self, *args, **kwargs): Initializes the 310P backend and sets up the device-specific mask builder. 
""" super().__init__(*args, **kwargs) - self.attn_mask_builder = AttentionMaskBuilder310(self.device) @staticmethod def get_kv_cache_shape(num_blocks: int, block_size: int, num_kv_heads: int, head_size: int): diff --git a/vllm_ascend/_310p/attention/metadata_builder.py b/vllm_ascend/_310p/attention/metadata_builder.py index 5e43ac63f85..64371c620d5 100644 --- a/vllm_ascend/_310p/attention/metadata_builder.py +++ b/vllm_ascend/_310p/attention/metadata_builder.py @@ -54,4 +54,5 @@ def __init__( super().__init__(kv_cache_spec, layer_names, vllm_config, device) # Override the mask builder with the 310P-specific version - self.attn_mask_builder: Any = AttentionMaskBuilder310(self.device) + max_model_len = vllm_config.model_config.max_model_len + self.attn_mask_builder: Any = AttentionMaskBuilder310(self.device, max_model_len) diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py index 42af44c33cd..3dfa072e108 100644 --- a/vllm_ascend/_310p/fused_moe/fused_moe.py +++ b/vllm_ascend/_310p/fused_moe/fused_moe.py @@ -27,6 +27,7 @@ from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods from vllm_ascend.quantization.methods.base import QuantType +from vllm_ascend.utils import vllm_version_is from .experts_selector import select_experts from .moe_comm_method import AllGatherCommImpl310 @@ -153,6 +154,26 @@ def __init__(self, *args, **kwargs): _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config) + if not vllm_version_is("0.15.0"): + self.runner = self._init_runner() + + if not vllm_version_is("0.15.0"): + + def _init_runner(self): + from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner + + return AscendMoERunner( + layer=self, + moe_config=self.moe_config, + router=self.router, + routed_input_transform=self._routed_input_transform, + gate=self.gate, + shared_experts=self.shared_experts, + 
quant_method=self.quant_method, + reduce_results=self.reduce_results, + enable_dbo=self.vllm_config.parallel_config.enable_dbo, + ) + def init_experts_map(self, moe_config): """ Initialize expert mapping for MoE (Mixture of Experts) model.