vllm-project · wangxiyuan · Feb 25, 2026 · Feb 12, 2026 · Feb 12, 2026 · Feb 12, 2026
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
+          VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
 
       - name: Checkout repository

@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
+ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT

@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml

@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 9562912cead1f11e8540fb91306c5cbda66f0007
+      vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -87,7 +87,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007]
+        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}

@@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 
 | vLLM Ascend | vLLM         | Python           | Stable CANN | PyTorch/torch_npu  |
 |-------------|--------------|------------------|-------------|--------------------|
-|     main    | 9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0 tag | >= 3.10, < 3.12   | 8.5.0 | 2.9.0 / 2.9.0 |
+|     main    | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12   | 8.5.0 | 2.9.0 / 2.9.0 |
 
 ## Release cadence
 

@@ -13,6 +13,7 @@ setuptools>=64
 setuptools-scm>=8
 torch==2.9.0
 torchvision
+torchaudio
 wheel
 xgrammar>=0.1.30
 pandas-stubs

@@ -25,22 +25,35 @@ def setUp(self, mock_fix_incompatible_config):
         if vllm_version_is("0.15.0"):
             moe_parallel_config = FusedMoEParallelConfig(
                 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
+            moe_config = FusedMoEConfig(
+                num_experts=8,
+                experts_per_token=8,
+                hidden_dim=8192,
+                intermediate_size_per_partition=5,
+                num_local_experts=8,
+                activation="silu",
+                device="npu",
+                routing_method=RoutingMethodType.Simulated,
+                moe_parallel_config=moe_parallel_config,
+                in_dtype=torch.float16,
+            )
         else:
             moe_parallel_config = FusedMoEParallelConfig(
-                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
-                is_sequence_parallel=False, enable_eplb=True)
-        moe_config = FusedMoEConfig(
-            num_experts=8,
-            experts_per_token=8,
-            hidden_dim=8192,
-            intermediate_size_per_partition=5,
-            num_local_experts=8,
-            activation="silu",
-            device="npu",
-            routing_method=RoutingMethodType.Simulated,
-            moe_parallel_config=moe_parallel_config,
-            in_dtype=torch.float16,
-        )
+                2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
+                enable_eplb=True)
+            moe_config = FusedMoEConfig(
+                num_experts=8,
+                experts_per_token=8,
+                hidden_dim=8192,
+                intermediate_size_per_partition=5,
+                num_local_experts=8,
+                num_logical_experts=8,
+                activation="silu",
+                device="npu",
+                routing_method=RoutingMethodType.Simulated,
+                moe_parallel_config=moe_parallel_config,
+                in_dtype=torch.float16,
+            )
         moe_config.supports_eplb = True
         self.vllm_config = vllm_config
         self.moe_config = moe_config

diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py
@@ -236,22 +236,22 @@ def _prepare_input_ids(
                 prev_draft_token_indices.extend(range(start, start + draft_len))
                 indices_match &= prev_index == flattened_index
                 max_flattened_index = max(max_flattened_index, flattened_index)
-        num_commmon_tokens = len(sample_flattened_indices)
+        num_common_tokens = len(sample_flattened_indices)
         total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
-        if num_commmon_tokens < total_without_spec:
+        if num_common_tokens < total_without_spec:
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
             if self.enable_prompt_embeds:
                 self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
                 self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
-        if num_commmon_tokens == 0:
+        if num_common_tokens == 0:
             return
-        if indices_match and max_flattened_index == (num_commmon_tokens - 1):
+        if indices_match and max_flattened_index == (num_common_tokens - 1):
             # NOTE: Override the copy_ function here
-            indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device)
-            source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0]
+            indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
+            source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
             self.input_ids.gpu.index_copy_(0, indices, source)
             if self.enable_prompt_embeds:
-                self.is_token_ids.gpu[:num_commmon_tokens] = True
+                self.is_token_ids.gpu[:num_common_tokens] = True
             return
         # Upload the index tensors asynchronously so the scatter can be non-blocking.
         sampled_tokens_index_tensor = torch.tensor(

@@ -28,6 +28,13 @@
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
 
+from vllm_ascend.utils import vllm_version_is
+
+if not vllm_version_is("0.15.0"):
+    from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase  # type: ignore
+    from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter  # type: ignore
+    from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner  # type: ignore
+
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
@@ -154,6 +161,77 @@ def apply(
         return final_hidden_states
 
 
+if not vllm_version_is("0.15.0"):
+    # Please remove this inheritance after extending vllm, todo(wxs)
+    class AscendMoERunner(DefaultMoERunner):
+        """
+        Default implementation of the MoE runner for executing Mixture of Experts layers.
+
+        This class provides a comprehensive implementation for running MoE computations
+        with support for:
+        - Expert routing and token dispatching
+        - Shared experts computation with optional parallel execution using CUDA streams
+        - Data parallel (DP) chunking for large batch processing
+        - Tensor model parallel and expert parallel operations
+        - Various quantization methods and custom operators
+        - Both monolithic and decomposed expert execution paths
+
+        The runner handles the complete MoE forward pass including routing tokens to
+        experts, executing expert computations, and combining results. It supports
+        advanced features like overlapped execution of shared experts and optimized
+        kernels for different parallel execution modes.
+
+        Eventually, this class will be split up and specialized for different
+        configurations, e.g. the presence or absence of shared experts, a gate, etc.
+        """
+
+        def __init__(
+            self,
+            layer: torch.nn.Module,
+            moe_config: FusedMoEConfig,
+            router: FusedMoERouter,
+            routed_input_transform: torch.nn.Module | None,
+            gate: torch.nn.Module | None,
+            shared_experts: torch.nn.Module | None,
+            quant_method: FusedMoEMethodBase,
+            reduce_results: bool,
+            enable_dbo: bool,
+        ):
+            super().__init__(
+                layer,
+                moe_config,
+                router,
+                routed_input_transform,
+                gate,
+                shared_experts,
+                quant_method,
+                reduce_results,
+                enable_dbo,
+            )
+            if self.shared_experts is None:
+                self.moe_forward = torch.ops.vllm.moe_forward
+            else:
+                self.moe_forward = torch.ops.vllm.moe_forward_shared
+
+        def forward_impl(
+            self,
+            layer: torch.nn.Module,
+            hidden_states: torch.Tensor,
+            router_logits: torch.Tensor,
+            shared_input: torch.Tensor | None,
+        ):
+            """
+            Override the default forward_impl to use Ascend-specific implementation.
+            This delegates to the layer's forward_impl method which contains the
+            Ascend-specific MoE computation logic.
+            """
+            result = layer.forward_impl(hidden_states, router_logits)
+            # If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out)
+            # Otherwise, it returns just routed_out
+            # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
+            return result
+
+
 class AscendFusedMoE(FusedMoE):
     moe_counter = -1
     gate_stream: torch.npu.Stream | None = None
@@ -237,6 +315,26 @@ def __init__(self, *args, **kwargs):
 
         setup_moe_comm_method(self.moe_config)
         self.quant_type = self._get_quant_type()
+        if not vllm_version_is("0.15.0"):
+            self.runner = self._init_runner()
+
+    if not vllm_version_is("0.15.0"):
+
+        def _init_runner(self):
+            # Storing the runner in the FusedMoE is an intermediate state, eventually
+            # the runner will own the FusedMoE layer and provide the execution interface
+            # for MoE ops.
+            return AscendMoERunner(
+                layer=self,
+                moe_config=self.moe_config,
+                router=self.router,
+                routed_input_transform=self._routed_input_transform,
+                gate=self.gate,
+                shared_experts=self.shared_experts,
+                quant_method=self.quant_method,
+                reduce_results=self.reduce_results,
+                enable_dbo=self.vllm_config.parallel_config.enable_dbo,
+            )
 
     def _get_quant_type(self) -> QuantType:
         quant_type = QuantType.NONE
@@ -266,6 +364,19 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens
         """
         return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
 
+    if not vllm_version_is("0.15.0"):
+
+        def forward(
+            self,
+            hidden_states: torch.Tensor,
+            router_logits: torch.Tensor,
+        ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+            self.ensure_moe_quant_config_init()
+            return self.runner.forward(
+                hidden_states,
+                router_logits,
+            )
+
     def forward_impl(  # type: ignore[override]
         self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
     ) -> torch.Tensor | FusedMoEResult:
@@ -414,6 +525,10 @@ def __init__(
             logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
 
         self._gate = gate
+        if not vllm_version_is("0.15.0"):
+            # Recreate the runner with the correct shared_experts parameter
+            # The parent class created the runner before self._shared_experts was set
+            self.runner = self._init_runner()
 
         if self.multistream_overlap_shared_expert:
             # Wrap the quant_method's process_weights_after_loading to validate that

@@ -524,6 +524,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
             "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV."
         )
 
+    from vllm_ascend.utils import vllm_version_is
+
+    if vllm_version_is("0.15.0"):
+        arch_name = vllm_config.model_config.architectures[0]
+    else:
+        arch_name = vllm_config.model_config.architecture
+
     # If original sizes exceed maximum, sample a representative subset
     if max_num_batch_sizes < len(original_sizes):
         # Sample uniformly from original sizes
@@ -535,10 +542,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
 
         sampled_sizes = [original_sizes[i] for i in indices]
         update_cudagraph_capture_sizes(vllm_config, sampled_sizes)
-
         logger.info(
             "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
-            vllm_config.model_config.architectures[0],
+            arch_name,
             num_hidden_layers,
             len(original_sizes),
             len(
@@ -550,7 +556,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
         compilation_config.cudagraph_capture_sizes = original_sizes
         logger.info(
             "No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes",
-            vllm_config.model_config.architectures[0],
+            arch_name,
             num_hidden_layers,
             len(original_sizes),
         )