2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=v0.19.0
ARG VLLM_COMMIT=5af684c31912232e5c89484c2e8259e0fac6c55b
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [v0.19.0]
vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
8 changes: 4 additions & 4 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: v0.19.0
vllm: 5af684c31912232e5c89484c2e8259e0fac6c55b
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
@@ -91,7 +91,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [v0.19.0]
vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -103,7 +103,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [v0.19.0]
vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
@@ -116,4 +116,4 @@
type: light
secrets:
HW_OBS_AK: ${{ secrets.HW_OBS_AK }}
HW_OBS_SK: ${{ secrets.HW_OBS_SK }}
HW_OBS_SK: ${{ secrets.HW_OBS_SK }}
6 changes: 2 additions & 4 deletions Dockerfile
@@ -48,10 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.310p
@@ -33,10 +33,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.310p.openEuler
@@ -32,10 +32,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.a3
@@ -50,10 +50,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.a3.openEuler
@@ -49,10 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 2 additions & 4 deletions Dockerfile.openEuler
@@ -49,10 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=v0.19.0
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.19.0
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
12 changes: 12 additions & 0 deletions tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py
@@ -8,11 +8,23 @@

import vllm_ascend.patch.worker.patch_gdn_attn as patch_gdn_attn
from vllm.config.compilation import CUDAGraphMode
from vllm.model_executor.layers.fla.ops import index as _fla_index
from vllm.v1.attention.backend import CommonAttentionMetadata
from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
from vllm.v1.kv_cache_interface import MambaSpec


@pytest.fixture(autouse=True)
def _patch_triton_cdiv(monkeypatch):
if not hasattr(_fla_index.triton, "cdiv"):
monkeypatch.setattr(
_fla_index.triton,
"cdiv",
lambda a, b: (a + b - 1) // b,
raising=False,
)


@dataclass
class BatchSpec:
seq_lens: list[int]
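The autouse fixture above only installs a `cdiv` shim when the imported `triton` stub lacks one. For reference, a minimal standalone sketch of the same ceiling-division fallback — the function name and asserts here are illustrative, not part of the patch:

```python
import math


def cdiv(a: int, b: int) -> int:
    # Integer ceiling division via the usual (a + b - 1) // b idiom,
    # matching what the fixture patches in as a stand-in for triton.cdiv.
    return (a + b - 1) // b


assert cdiv(10, 4) == math.ceil(10 / 4) == 3  # rounds up
assert cdiv(12, 4) == 3                       # exact multiples are unchanged
```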
62 changes: 39 additions & 23 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -27,6 +27,7 @@
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
from vllm_ascend.quantization.quant_type import QuantType
from vllm_ascend.utils import vllm_version_is

from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310
@@ -36,6 +37,10 @@ class AscendUnquantizedFusedMoEMethod310(UnquantizedFusedMoEMethod):
def __init__(self, moe: FusedMoEConfig = None):
super().__init__(moe=moe)

@property
def is_monolithic(self) -> bool:
return False

def process_weights_after_loading(self, layer):
super().process_weights_after_loading(layer)

@@ -156,21 +161,20 @@ def __init__(self, *args, **kwargs):
self.quant_type = self.get_quant_type()

_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
self.runner = self._init_runner()

def _init_runner(self):
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

return AscendMoERunner(
layer=self,
moe_config=self.moe_config,
router=self.router,
routed_input_transform=self._routed_input_transform,
gate=self.gate,
shared_experts=self.shared_experts,
quant_method=self.quant_method,
reduce_results=self.reduce_results,
enable_dbo=self.vllm_config.parallel_config.enable_dbo,
is_legacy = vllm_version_is("0.19.0")
self.runner = AscendMoERunner(
self if is_legacy else self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
self.gate if is_legacy else kwargs.pop("gate", None),
self.shared_experts if is_legacy else kwargs.pop("shared_experts", None),
self.quant_method,
self.reduce_results,
self.vllm_config.parallel_config.enable_dbo,
)

def init_experts_map(self, moe_config):
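The positional call above is easy to misread, so here is an illustrative sketch of the version-gated pattern it follows — `ToyRunner`, `version_is`, and `build_runner` are toy stand-ins, not the real vLLM/vllm-ascend signatures. On vLLM 0.19.0 the runner receives the layer object and the layer's own gate/shared-experts attributes, while on newer commits it receives the layer name and pulls gate/shared_experts out of the constructor kwargs:

```python
# Toy sketch of the version-gated construction; not real vllm/vllm_ascend APIs.
from typing import Any, Optional


def version_is(version: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is.
    return version == "0.19.0"


class ToyRunner:
    def __init__(self, layer_or_name: Any, gate: Optional[Any], shared_experts: Optional[Any]):
        self.layer_or_name = layer_or_name
        self.gate = gate
        self.shared_experts = shared_experts


def build_runner(layer: Any, kwargs: dict) -> ToyRunner:
    is_legacy = version_is("0.19.0")
    return ToyRunner(
        # Legacy vLLM takes the layer itself; newer vLLM takes its registered name.
        layer if is_legacy else getattr(layer, "layer_name", None),
        # Legacy reads gate/shared_experts from the layer; newer passes them as kwargs.
        getattr(layer, "gate", None) if is_legacy else kwargs.pop("gate", None),
        getattr(layer, "shared_experts", None) if is_legacy else kwargs.pop("shared_experts", None),
    )
```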
@@ -276,7 +280,23 @@ def __init__(
self._gate = gate
# Recreate runner after shared_experts/gate are set so custom op dispatch
# goes through moe_forward_shared.
self.runner = self._init_runner()
# NOTE: must use self._shared_experts here, not self.shared_experts —
# FusedMoE.shared_experts is a property that reads self.runner.shared_experts,
# which at this point is still the stale runner built with shared_experts=None.
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

is_legacy = vllm_version_is("0.19.0")
self.runner = AscendMoERunner(
self if is_legacy else self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
self.gate,
self._shared_experts,
self.quant_method,
self.reduce_results,
self.vllm_config.parallel_config.enable_dbo,
)

@property
def is_internal_router(self) -> bool:
@@ -288,20 +308,16 @@ def forward(
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
if self._shared_experts is None:
fused_out = AscendFusedMoE310.forward(
self,
hidden_states=hidden_states,
router_logits=router_logits,
)
shared_out = None
return shared_out, fused_out
shared_out, fused_out = AscendFusedMoE310.forward(
result = AscendFusedMoE310.forward(
self,
hidden_states=hidden_states,
router_logits=router_logits,
)
return shared_out, fused_out
# When shared experts are absent, the parent returns only fused_out;
# otherwise it returns a (shared_out, fused_out) tuple.
if self._shared_experts is None:
return None, result
return result

def _forward_shared_experts(self, hidden_states: torch.Tensor):
if self._shared_experts is None:
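The forward() change above relies on the parent returning different shapes depending on whether shared experts exist. A small hedged sketch of that normalization, with toy values standing in for the real tensors:

```python
from typing import Any, Optional, Tuple, Union


def normalize(result: Union[Any, Tuple[Any, Any]], has_shared_experts: bool) -> Tuple[Optional[Any], Any]:
    # The parent forward returns only fused_out when shared experts are absent,
    # and a (shared_out, fused_out) tuple when they are present; callers of the
    # wrapper always expect the pair.
    if not has_shared_experts:
        return None, result
    return result


assert normalize("fused", has_shared_experts=False) == (None, "fused")
assert normalize(("shared", "fused"), has_shared_experts=True) == ("shared", "fused")
```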