Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions tests/e2e/310p/multicard/test_dense_model_multicard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,26 @@ def test_qwen3_dense_tp2_fp16():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-8B",
tensor_parallel_size=2,
enforce_eager=True,
dtype="float16"
"Qwen/Qwen3-8B",
tensor_parallel_size=2,
enforce_eager=True,
dtype="float16",
max_model_len=16384,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


def test_qwen3_dense_tp4_w8a8():
    """Smoke-test greedy decoding of the W8A8-quantized Qwen3-32B on 4 cards.

    Runs tensor-parallel size 4 in eager mode with Ascend quantization and a
    bounded ``max_model_len`` (NOTE(review): presumably to cap KV-cache
    allocation on 310P — confirm against runner defaults).
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "vllm-ascend/Qwen3-32B-W8A8",
        tensor_parallel_size=4,
        enforce_eager=True,
        dtype="float16",
        quantization="ascend",
        max_model_len=16384,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
33 changes: 19 additions & 14 deletions tests/e2e/310p/multicard/test_moe_model_multicard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,37 +24,42 @@ def test_qwen3_moe_tp4_fp16():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=4,
enforce_eager=True,
dtype="float16"
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=4,
enforce_eager=True,
dtype="float16",
max_model_len=16384,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


def test_qwen3_moe_ep4_fp16():
    """Smoke-test greedy decoding of Qwen3-30B-A3B (MoE) with expert parallelism.

    Runs tensor-parallel size 4 with ``enable_expert_parallel=True`` in eager
    mode; ``max_model_len`` is bounded (NOTE(review): presumably to cap
    KV-cache allocation on 310P — confirm).
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "Qwen/Qwen3-30B-A3B",
        tensor_parallel_size=4,
        enforce_eager=True,
        dtype="float16",
        enable_expert_parallel=True,
        max_model_len=16384,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)


def test_qwen3_moe_tp2_w8a8():
    """Smoke-test greedy decoding of the W8A8-quantized Qwen3-30B-A3B (MoE).

    Runs tensor-parallel size 2 in eager mode with Ascend quantization and a
    bounded ``max_model_len`` (NOTE(review): presumably to cap KV-cache
    allocation on 310P — confirm).
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "vllm-ascend/Qwen3-30B-A3B-W8A8",
        tensor_parallel_size=2,
        enforce_eager=True,
        dtype="float16",
        quantization="ascend",
        max_model_len=16384,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
21 changes: 12 additions & 9 deletions tests/e2e/310p/singlecard/test_dense_model_singlecard.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,26 @@ def test_qwen3_dense_tp1_fp16():
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-8B",
tensor_parallel_size=1,
enforce_eager=True,
dtype="float16"
"Qwen/Qwen3-8B",
tensor_parallel_size=1,
enforce_eager=True,
dtype="float16",
max_model_len=16384,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)


def test_qwen3_dense_tp1_w8a8():
    """Smoke-test greedy decoding of the W8A8-quantized Qwen3-8B on one card.

    Runs tensor-parallel size 1 in eager mode with Ascend quantization and a
    bounded ``max_model_len`` (NOTE(review): presumably to cap KV-cache
    allocation on 310P — confirm).
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
        "vllm-ascend/Qwen3-8B-W8A8",
        tensor_parallel_size=1,
        enforce_eager=True,
        dtype="float16",
        quantization="ascend",
        max_model_len=16384,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
9 changes: 5 additions & 4 deletions tests/ut/_310p/attention/test_attention_mask_310.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@

class TestAttentionMaskBuilder310(TestBase):
def setUp(self):
    """Create the mask builder under test with a fixed maximum sequence length.

    ``self.max_seqlen`` is kept on the test case so the shape assertions in
    the individual tests can be derived from it rather than hard-coded.
    """
    self.max_seqlen = 4096
    self.attention_mask_builder = AttentionMaskBuilder310(torch.device("cpu"), self.max_seqlen)

def test_get_attention_mask_310_for_pooling_model(self):
model_config = MagicMock()
Expand All @@ -36,7 +37,7 @@ def test_get_attention_mask_310(self, mock_format_cast):
mock_format_cast.side_effect = lambda x, y: x
model_config = MagicMock()
attn_mask = self.attention_mask_builder.get_attention_mask(model_config)
self.assertEqual(attn_mask.shape, (1, 128, 2048, 16))
self.assertEqual(attn_mask.shape, (1, self.max_seqlen // 16, self.max_seqlen, 16))
self.assertEqual(attn_mask[0][-1][0][-1], torch.tensor(float("-inf"), dtype=torch.float16))

@patch("torch_npu.npu_format_cast")
Expand All @@ -47,7 +48,7 @@ def test_get_swa_mask_310(self, mock_format_cast):

sliding_window = 128
swa_mask = self.attention_mask_builder.get_swa_mask(torch.float16, sliding_window)
self.assertEqual(swa_mask.shape, (1, 128, 2048, 16))
self.assertEqual(swa_mask.shape, (1, self.max_seqlen // 16, self.max_seqlen, 16))
self.assertEqual(swa_mask[0][-1][0][-1], torch.tensor(float("-inf"), dtype=torch.float16))
self.assertEqual(swa_mask[0][0][-1][0], torch.tensor(float("-inf"), dtype=torch.float16))

Expand All @@ -58,4 +59,4 @@ def test_get_splitfuse_attn_mask_310(self, mock_format_cast):
attn_metadata.query_start_loc = torch.tensor([0, 1, 5])
attn_metadata.seq_lens = torch.tensor([7, 4])
attn_mask = self.attention_mask_builder.get_splitfuse_mask(attn_metadata, torch.device("cpu"))
self.assertEqual(attn_mask.shape, (1, 128, 16, 16))
self.assertEqual(attn_mask.shape, (1, self.max_seqlen // 16, 16, 16))
10 changes: 5 additions & 5 deletions vllm_ascend/_310p/attention/attention_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,20 @@

class AttentionMaskBuilder310:
chunked_prefill_attn_mask = None
max_seqlen = 2048
max_seqlen = 16384

def __init__(self, device: torch.device, max_seqlen: int):
    """
    Initializes the AttentionMaskBuilder for the 310P device.

    Args:
        device (torch.device): The device on which tensors will be allocated.
        max_seqlen (int): Maximum length of a sequence (including prompt and generated text).
    """
    # NOTE(review): stored on the CLASS, not the instance — constructing a
    # second builder with a different max_seqlen silently changes it for every
    # builder in the process. Confirm only one instance exists per process, or
    # make this an instance attribute.
    AttentionMaskBuilder310.max_seqlen = max_seqlen
    # Lazily-built mask caches; populated on first use.
    self.attn_mask_cache = None
    self.device = device
    self.swa_mask = None

@staticmethod
def gen_causal_additive_mask(max_seq_len: int, device: torch.device):
Expand Down Expand Up @@ -147,8 +148,7 @@ def _get_causal_mask(self, max_seq_len: int) -> torch.Tensor:
Returns:
torch.Tensor: The cached causal mask in ACL_FORMAT_FRACTAL_NZ.
"""
if self.attn_mask_cache is None or max_seq_len > self._seq_len_cached:
if self.attn_mask_cache is None:
attn_mask = self.gen_causal_additive_mask(max_seq_len, self.device)
self.attn_mask_cache = torch_npu.npu_format_cast(nd_to_nz_2d(attn_mask), ACL_FORMAT_FRACTAL_NZ)
self._seq_len_cached = max_seq_len
return self.attn_mask_cache
1 change: 0 additions & 1 deletion vllm_ascend/_310p/attention/attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def __init__(self, *args, **kwargs):
Initializes the 310P backend and sets up the device-specific mask builder.
"""
super().__init__(*args, **kwargs)
self.attn_mask_builder = AttentionMaskBuilder310(self.device)

@staticmethod
def get_kv_cache_shape(num_blocks: int, block_size: int, num_kv_heads: int, head_size: int):
Expand Down
3 changes: 2 additions & 1 deletion vllm_ascend/_310p/attention/metadata_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,5 @@ def __init__(
super().__init__(kv_cache_spec, layer_names, vllm_config, device)

# Override the mask builder with the 310P-specific version
self.attn_mask_builder: Any = AttentionMaskBuilder310(self.device)
max_model_len = vllm_config.model_config.max_model_len
self.attn_mask_builder: Any = AttentionMaskBuilder310(self.device, max_model_len)
21 changes: 21 additions & 0 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.quantization.methods.base import QuantType
from vllm_ascend.utils import vllm_version_is

from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310
Expand Down Expand Up @@ -153,6 +154,26 @@ def __init__(self, *args, **kwargs):

_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)

if not vllm_version_is("0.15.0"):
self.runner = self._init_runner()

if not vllm_version_is("0.15.0"):

def _init_runner(self):
    """Construct the AscendMoERunner wired to this layer's MoE components.

    The import is done locally inside the method (presumably to avoid an
    import cycle with ``vllm_ascend.ops.fused_moe.fused_moe`` — confirm).

    Returns:
        AscendMoERunner: Runner configured with this layer's router, gate,
        shared experts, quantization method and DBO setting.
    """
    from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

    return AscendMoERunner(
        layer=self,
        moe_config=self.moe_config,
        router=self.router,
        routed_input_transform=self._routed_input_transform,
        gate=self.gate,
        shared_experts=self.shared_experts,
        quant_method=self.quant_method,
        reduce_results=self.reduce_results,
        # DBO (dual-batch overlap) flag comes from the engine's parallel config.
        enable_dbo=self.vllm_config.parallel_config.enable_dbo,
    )

def init_experts_map(self, moe_config):
"""
Initialize expert mapping for MoE (Mixture of Experts) model.
Expand Down