4 changes: 2 additions & 2 deletions .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
@@ -102,7 +102,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- vllm_version: [v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
needs: [parse-trigger]
if: ${{ needs.parse-trigger.outputs.allowed == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
4 changes: 2 additions & 2 deletions .github/workflows/pr_test_light.yaml
@@ -154,7 +154,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
uses: ./.github/workflows/_optional_smart_e2e.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -164,7 +164,7 @@
name: e2e-light
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
needs: [lint, changes]
# Only trigger the e2e test after lint has passed and the pull request's changes are e2e-related.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_update_estimated_time.yaml
@@ -23,7 +23,7 @@ jobs:
name: e2e-test
strategy:
matrix:
- vllm_version: [v0.19.1]
+ vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
type: [full, light]
uses: ./.github/workflows/_e2e_test.yaml
with:
2 changes: 1 addition & 1 deletion .github/workflows/schedule_vllm_e2e_test.yaml
@@ -45,7 +45,7 @@ jobs:
fail-fast: false
matrix:
part: [0, 1, 2, 3]
- vllm: [v0.19.1]
+ vllm: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11
env:
8 changes: 6 additions & 2 deletions Dockerfile
@@ -48,8 +48,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
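Note: `git clone --depth 1 -b` accepts only branch or tag names, not bare commit SHAs, so pinning to a specific vLLM commit requires the `git init` / shallow `git fetch <sha>` / `git checkout FETCH_HEAD` sequence shown above (GitHub's servers permit fetching an explicit commit). The same substitution is applied to every Dockerfile variant below.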
8 changes: 6 additions & 2 deletions Dockerfile.310p
@@ -33,8 +33,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 6 additions & 2 deletions Dockerfile.310p.openEuler
@@ -32,8 +32,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 6 additions & 2 deletions Dockerfile.a3
@@ -50,8 +50,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 6 additions & 2 deletions Dockerfile.a3.openEuler
@@ -49,8 +49,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 6 additions & 2 deletions Dockerfile.openEuler
@@ -49,8 +49,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.19.1
- RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ # ARG VLLM_TAG=v0.19.1
+ # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ RUN git init /vllm-workspace/vllm && \
+     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+     git -C /vllm-workspace/vllm checkout FETCH_HEAD
# On x86, triton is installed by vLLM, but it does not work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -62,7 +62,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | Triton Ascend |
|-------------|--------------|------------------|-------------|--------------------|---------------|
- | main | {{main_vllm_commit}}, {{main_vllm_tag}} | {{main_python_version}} | {{main_cann_version}} | {{main_pytorch_torch_npu_version}} | {{main_triton_ascend_version}} |
+ | main | {{main_vllm_commit}} | {{main_python_version}} | {{main_cann_version}} | {{main_pytorch_torch_npu_version}} | {{main_triton_ascend_version}} |

## Release cadence

4 changes: 0 additions & 4 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -22,15 +22,13 @@
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner
- from vllm_ascend.utils import vllm_version_is

MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]

MAIN_MODELS = ["LLM-Research/Meta-Llama-3.1-8B-Instruct"]
EGALE_MODELS = ["vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"]


- @pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [True])
@@ -65,7 +63,6 @@ def test_qwen3_dense_eager_mode(
runner.model.generate(prompts, sampling_params)


- @pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1")
@pytest.mark.parametrize("model", MAIN_MODELS)
@pytest.mark.parametrize("eagle_model", EGALE_MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@@ -104,7 +101,6 @@ def test_egale_spec_decoding(
runner.model.generate(prompts, sampling_params)


- @pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [False])
5 changes: 0 additions & 5 deletions tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
@@ -625,11 +625,6 @@ def test_dflash_acceptance(
method: str,
num_speculative_tokens: int,
):
- from vllm_ascend.utils import vllm_version_is
-
- if vllm_version_is("0.19.1"):
-     pytest.skip("Dflash tests are not supported on vLLM version 0.19.1")
-
main_model_name = DFLASH[method]["main"]
spec_model_name = DFLASH[method]["spec"]

13 changes: 2 additions & 11 deletions tests/ut/ops/test_mla.py
@@ -8,7 +8,6 @@

from tests.ut.base import TestBase
from vllm_ascend.ops.mla import AscendMultiHeadLatentAttention, IndexerWrapper
- from vllm_ascend.utils import vllm_version_is


class TestIndexerWrapper(TestBase):
@@ -19,11 +18,7 @@ def test_initialization(self):
mock_indexer.topk_tokens = 2048
mock_indexer.q_lora_rank = 1536
mock_indexer.wq_b = nn.Linear(128, 128)
- if vllm_version_is("0.19.1"):
-     mock_indexer.wk = nn.Linear(128, 128)
-     mock_indexer.weights_proj = nn.Linear(128, 128)
- else:
-     mock_indexer.wk_weights_proj = nn.Linear(128, 128)
+ mock_indexer.wk_weights_proj = nn.Linear(128, 128)
mock_indexer.k_norm = nn.LayerNorm(128)
mock_indexer.softmax_scale = 0.123
mock_indexer.topk_indices_buffer = torch.randn(10)
@@ -36,11 +31,7 @@ def test_initialization(self):
self.assertEqual(wrapper.topk_tokens, 2048)
self.assertEqual(wrapper.q_lora_rank, 1536)
self.assertIs(wrapper.wq_b, mock_indexer.wq_b)
- if vllm_version_is("0.19.1"):
-     self.assertIs(wrapper.wk, mock_indexer.wk)
-     self.assertIs(wrapper.weights_proj, mock_indexer.weights_proj)
- else:
-     self.assertIs(wrapper.wk_weights_proj, mock_indexer.wk_weights_proj)
+ self.assertIs(wrapper.wk_weights_proj, mock_indexer.wk_weights_proj)
self.assertIs(wrapper.k_norm, mock_indexer.k_norm)
self.assertEqual(wrapper.softmax_scale, 0.123)

11 changes: 4 additions & 7 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -27,7 +27,6 @@
from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
from vllm_ascend.quantization.quant_type import QuantType
- from vllm_ascend.utils import vllm_version_is

from .experts_selector import select_experts
from .moe_comm_method import AllGatherCommImpl310
@@ -164,14 +163,13 @@ def __init__(self, *args, **kwargs):

from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

- is_legacy = vllm_version_is("0.19.1")
self.runner = AscendMoERunner(
- self if is_legacy else self.layer_name,
+ self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
- self.gate if is_legacy else kwargs.pop("gate", None),
- self.shared_experts if is_legacy else kwargs.pop("shared_experts", None),
+ kwargs.pop("gate", None),
+ kwargs.pop("shared_experts", None),
self.quant_method,
self.reduce_results,
self.vllm_config.parallel_config.enable_dbo,
@@ -285,9 +283,8 @@ def __init__(
# which at this point is still the stale runner built with shared_experts=None.
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

- is_legacy = vllm_version_is("0.19.1")
self.runner = AscendMoERunner(
- self if is_legacy else self.layer_name,
+ self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
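The runner now receives `gate` and `shared_experts` as optional keyword arguments instead of reading them from `self`. A tiny sketch of the `kwargs.pop(key, None)` idiom used above (function name hypothetical, not part of vllm_ascend):

```python
def build_runner(*args, **kwargs):
    # Consume optional keyword arguments up front; absent keys fall back to None.
    gate = kwargs.pop("gate", None)
    shared_experts = kwargs.pop("shared_experts", None)
    return gate, shared_experts, kwargs  # remaining kwargs are left untouched

assert build_runner(gate="g") == ("g", None, {})
assert build_runner() == (None, None, {})
```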
6 changes: 1 addition & 5 deletions vllm_ascend/ascend_forward_context.py
@@ -20,7 +20,6 @@
is_drafter_moe_model,
is_moe_model,
speculative_enable_dispatch_gmm_combine_decode,
- vllm_version_is,
)


@@ -156,10 +155,7 @@ def set_ascend_forward_context(
dp_world_size = get_dp_group().world_size
if dp_world_size > 1 and forward_context.dp_metadata is not None:
dp_meta = forward_context.dp_metadata
- if vllm_version_is("0.19.1"):
-     max_tokens_across_dp = dp_meta.max_tokens_across_dp_cpu.item()
- else:
-     max_tokens_across_dp = dp_meta.num_tokens_across_dp_cpu.max().item()
+ max_tokens_across_dp = dp_meta.num_tokens_across_dp_cpu.max().item()
if forward_context.flash_comm_v1_enabled or forward_context.flashcomm_v2_enabled:
padded_length = (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size
pad_size = padded_length - num_tokens
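The surviving branch takes the maximum of the per-rank token counts; the padding lines visible above then round that value up to a multiple of the TP world size before computing `pad_size`. A minimal standalone sketch of that arithmetic (function name hypothetical, not part of vllm_ascend):

```python
def padded_length(max_tokens_across_dp: int, tp_world_size: int) -> int:
    # Round up to the next multiple of tp_world_size, as in the hunk above.
    return (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size

assert padded_length(10, 4) == 12  # 10 tokens, TP=4 -> pad to 12; pad_size = 12 - num_tokens
assert padded_length(8, 4) == 8    # already aligned, no extra padding needed
```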
9 changes: 2 additions & 7 deletions vllm_ascend/attention/context_parallel/sfa_cp.py
@@ -12,7 +12,6 @@
from vllm_ascend.attention.sfa_v1 import AscendSFAImpl, AscendSFAMetadata, AscendSFAMetadataBuilder
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, enabling_mlapo, split_decodes_and_prefills
from vllm_ascend.ops.triton.rope import rope_forward_triton_siso
- from vllm_ascend.utils import vllm_version_is

M = TypeVar("M", bound=AscendSFAMetadata)

@@ -414,12 +413,8 @@ def indexer_select_post_process(
actual_seq_lengths_query: torch.Tensor,
actual_seq_lengths_key: torch.Tensor,
):
- if vllm_version_is("0.19.1"):
-     weights, _ = self.weights_proj(x)
- else:
-     kw, _ = self.wk_weights_proj(x)
-     weights = kw[:, self.head_dim :]
-
+ kw, _ = self.wk_weights_proj(x)
+ weights = kw[:, self.head_dim :]
q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128]
q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128]
if HAS_TRITON:
22 changes: 5 additions & 17 deletions vllm_ascend/attention/sfa_v1.py
@@ -55,7 +55,6 @@
enable_dsa_cp_with_o_proj_tp,
get_weight_prefetch_method,
maybe_trans_nz,
- vllm_version_is,
)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch

@@ -439,11 +438,7 @@ def __init__(
self.n_head: int = self.indexer.n_head # 64
self.head_dim: int = self.indexer.head_dim # 128
self.wq_b = self.indexer.wq_b
- if vllm_version_is("0.19.1"):
-     self.wk = self.indexer.wk
-     self.weights_proj = self.indexer.weights_proj
- else:
-     self.wk_weights_proj = self.indexer.wk_weights_proj
+ self.wk_weights_proj = self.indexer.wk_weights_proj
self.k_norm = self.indexer.k_norm
self.cp_size = 1
self.is_rope_neox_style = True
@@ -912,11 +907,8 @@ def indexer_select_pre_process(
cos: torch.Tensor,
sin: torch.Tensor,
):
- if vllm_version_is("0.19.1"):
-     k_li, _ = self.wk(x)  # [b,s,7168] @ [7168,128] = [b,s,128]
- else:
-     kw, _ = self.wk_weights_proj(x)
-     k_li = kw[:, : self.head_dim]
+ kw, _ = self.wk_weights_proj(x)
+ k_li = kw[:, : self.head_dim]
k_li = self.k_norm(k_li).unsqueeze(1)
k_li = k_li.view(-1, 1, self.head_dim)

@@ -961,12 +953,8 @@ def indexer_select_post_process(
actual_seq_lengths_query: torch.Tensor,
actual_seq_lengths_key: torch.Tensor,
):
- if vllm_version_is("0.19.1"):
-     weights, _ = self.weights_proj(x)
- else:
-     kw, _ = self.wk_weights_proj(x)
-     weights = kw[:, self.head_dim :]
-
+ kw, _ = self.wk_weights_proj(x)
+ weights = kw[:, self.head_dim :]
q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128]
q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128]
if HAS_TRITON:
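Across sfa_cp.py and sfa_v1.py, the separate `wk` and `weights_proj` projections used on v0.19.1 are replaced by a single fused `wk_weights_proj` whose output is sliced at `head_dim`. A minimal sketch of that slicing, assuming the fused output width is `head_dim + n_head` (an assumption for illustration; in the diffs, `kw, _ = self.wk_weights_proj(x)` unpacks vLLM's `(output, bias)` return convention, which the plain `nn.Linear` below does not reproduce):

```python
import torch
from torch import nn

hidden_size, head_dim, n_head = 7168, 128, 64  # sizes taken from the shape comments above

# Hypothetical stand-in for the fused layer; the output width head_dim + n_head
# is an assumption for this sketch, not taken from the vLLM source.
wk_weights_proj = nn.Linear(hidden_size, head_dim + n_head, bias=False)

x = torch.randn(10, hidden_size)  # [n_toks, 7168]
kw = wk_weights_proj(x)           # one matmul instead of two
k_li = kw[:, :head_dim]           # key projection, formerly self.wk(x)
weights = kw[:, head_dim:]        # indexer weights, formerly self.weights_proj(x)

assert k_li.shape == (10, head_dim)
assert weights.shape == (10, n_head)
```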