vllm-project · Potabk · May 11, 2026 · May 11, 2026 · May 12, 2026 · May 13, 2026
@@ -27,8 +27,10 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_TAG=v0.20.1
-RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ARG VLLM_COMMIT=4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef
+RUN git init /vllm-workspace/vllm && \
+    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+    git -C /vllm-workspace/vllm checkout FETCH_HEAD
 
 # # Install vLLM common dependencies
 RUN python3 -m pip install -r /vllm-workspace/vllm/requirements/common.txt --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -80,7 +80,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
+        vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
@@ -102,7 +102,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        vllm_version: [v0.20.1]
+        vllm_version: [v0.20.2]
     needs: [parse-trigger]
     if: ${{ needs.parse-trigger.outputs.allowed == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml

@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: c7aa186d67b6f051680831418e957c67f34ba7a2
+      vllm: 4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef
   changes:
     runs-on: linux-aarch64-a2b3-0
     container:
@@ -155,7 +155,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
     strategy:
       matrix:
-        vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
+        vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
     uses: ./.github/workflows/_optional_smart_e2e.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -165,7 +165,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
+        vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

@@ -4,12 +4,6 @@ on:
     # Runs at 00:00 UTC+8 every day
     - cron: '0 20 * * *' 
   workflow_dispatch:
-    inputs:
-      vllm_hash:
-        description: 'vLLM base hash'
-        default: main
-        required: true
-        type: string
   push:
     paths:
       - '.github/workflows/dockerfiles/Dockerfile.lint'
@@ -85,5 +79,3 @@ jobs:
         labels: ${{ steps.meta.outputs.labels }}
         tags: ${{ steps.meta.outputs.tags }}
         provenance: false
-        build-args: |
-          VLLM_HASH=${{ inputs.vllm_hash }}
@@ -23,7 +23,7 @@ jobs:
     name: e2e-test
     strategy:
       matrix:
-        vllm_version: [v0.20.1]
+        vllm_version: [v0.20.2]
         type: [full, light]
     uses: ./.github/workflows/_e2e_test.yaml
     with:

@@ -47,7 +47,7 @@ jobs:
           fail-fast: false
           matrix:
             part: [0, 1, 2, 3]
-            vllm: [v0.20.1]
+            vllm: [v0.20.2]
         container:
           image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:9.0.0-910b-ubuntu22.04-py3.11
           env:

@@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -33,7 +33,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -32,7 +32,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -49,7 +49,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -49,7 +49,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -81,9 +81,9 @@
     # CANN image tag
     "cann_image_tag": "9.0.0-910b-ubuntu22.04-py3.11",
     # vLLM commit hash for main branch
-    "main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2",
+    "main_vllm_commit": "4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef",
     # vLLM tag for main branch
-    "main_vllm_tag": "v0.20.1",
+    "main_vllm_tag": "v0.20.2",
     # Python version for main branch
     "main_python_version": ">= 3.10, < 3.12",
     # CANN version for main branch

@@ -75,7 +75,7 @@ def test_qwen3_moe_distributed_aiv_tp2():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
+@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [True])
 @patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})

@@ -19,6 +19,7 @@ def test_qwen3_moe_routing_replay():
         cudagraph_capture_sizes=[1, 2, 4, 8],
         distributed_executor_backend="mp",
         enable_return_routed_experts=True,
+        async_scheduling=False,
     ) as vllm_model:
         sampling_params = SamplingParams(
             max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY

@@ -65,7 +65,7 @@ def test_qwen3_dense_eager_mode(
         runner.model.generate(prompts, sampling_params)
 
 
-@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
+@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
 @pytest.mark.parametrize("model", MAIN_MODELS)
 @pytest.mark.parametrize("eagle_model", EGALE_MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
@@ -104,7 +104,7 @@ def test_egale_spec_decoding(
         runner.model.generate(prompts, sampling_params)
 
 
-@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
+@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("enforce_eager", [False])

@@ -1385,10 +1385,13 @@ def check_mock(self):
             "method",
             "parallel_drafting",
             "draft_tensor_parallel_size",
-            "speculative_token_tree",
             "draft_model_config",
             "disable_padded_drafter_batch",
         }
+        # speculative_token_tree was removed in newer vllm (Remove tree attention #42121);
+        # only check for it when the installed version still carries the field.
+        if "speculative_token_tree" in vllm.config.SpeculativeConfig.__dataclass_fields__:
+            fields.add("speculative_token_tree")
 
         actual = set(vllm.config.SpeculativeConfig.__dataclass_fields__)
         missing = fields - actual
@@ -2260,10 +2263,13 @@ def check_mock(self):
             "enforce_eager",
             "use_local_argmax_reduction",
             "draft_tensor_parallel_size",
-            "speculative_token_tree",
             "draft_model_config",
             "disable_padded_drafter_batch",
         }
+        # speculative_token_tree was removed in newer vllm (Remove tree attention #42121);
+        # only check for it when the installed version still carries the field.
+        if "speculative_token_tree" in vllm.config.SpeculativeConfig.__dataclass_fields__:
+            fields.add("speculative_token_tree")
         actual = set(vllm.config.SpeculativeConfig.__dataclass_fields__)
         missing = fields - actual
         assert not missing, f"Missing dataclass fields: {missing}"

@@ -577,7 +577,7 @@ def schedule(self) -> SchedulerOutput:  # noqa: C901
                     num_encoder_tokens = sum(request.get_num_encoder_embeds(i) for i in encoder_inputs_to_schedule)
 
                 if (
-                    vllm_version_is("0.20.1")
+                    vllm_version_is("0.20.2")
                     and self.scheduler_reserve_full_isl
                     and not self.kv_cache_manager.can_fit_full_sequence(
                         request,
@@ -601,7 +601,7 @@ def schedule(self) -> SchedulerOutput:  # noqa: C901
                     delay_cache_blocks=load_kv_async,
                     num_encoder_tokens=num_encoder_tokens,
                     **(
-                        {} if vllm_version_is("0.20.1") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl}
+                        {} if vllm_version_is("0.20.2") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl}
                     ),
                 )
 

@@ -27,7 +27,6 @@
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
-from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
 from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner  # type: ignore
 
 import vllm_ascend.envs as envs_ascend
@@ -36,6 +35,7 @@
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
 from vllm_ascend.flash_common3_context import get_flash_common3_context, set_flash_common3_context
+from vllm_ascend.ops.fused_moe import routed_experts_compat
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts, zero_experts_compute
 from vllm_ascend.ops.fused_moe.moe_comm_method import AllGatherCommImpl, FusedExpertsResult, setup_moe_comm_method
 from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
@@ -159,12 +159,12 @@ def apply(
             num_experts=num_logical_experts,
         )
         if layer.vllm_config.model_config is not None and layer.vllm_config.model_config.enable_return_routed_experts:
-            capturer = RoutedExpertsCapturer.get_instance()
-            if capturer is not None:
-                capturer.capture(
-                    layer_id=layer.layer_id,
-                    topk_ids=topk_ids,
-                )
+            capturer = routed_experts_compat.get_capturer()
+            routed_experts_compat.call_capture(
+                capturer,
+                layer_id=layer.layer_id,
+                topk_ids=topk_ids,
+            )
 
         if zero_expert_num > 0 and zero_expert_type is not None:
             topk_ids, topk_weights, zero_expert_result = zero_experts_compute(

@@ -0,0 +1,120 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+"""Compatibility shim around vLLM's RoutedExpertsCapturer.
+- 0.20.2 exposed `RoutedExpertsCapturer.get_instance()` plus
+  `clear_buffer()` / `save_captured_experts(indices=...)` methods.
+- main moved to module-level helpers (`get_global_experts_capturer`,
+  `issue_routing_d2h_copy`, `extract_routed_experts_for_current_batch`,
+  `free_routing_buffers`, `init_routed_experts_capturer_with_shared_cache`).
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import torch
+from vllm.model_executor.layers.fused_moe import routed_experts_capturer as _rec
+
+from vllm_ascend.utils import vllm_version_is
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+
+USE_LEGACY_API = vllm_version_is("0.20.2")  # computed once at import; True selects the 0.20.2 branches below
+
+
+def get_capturer():
+    """Return the global capturer via the version-matching vLLM API, or None if not initialized."""
+    if USE_LEGACY_API:  # vLLM 0.20.2: class-level accessor
+        return _rec.RoutedExpertsCapturer.get_instance()
+    return _rec.get_global_experts_capturer()  # otherwise: module-level helper
+
+
+def clear_step_buffers(scheduler_output: SchedulerOutput) -> None:
+    """Free routing buffers using whichever API the installed vLLM provides.
+
+    main: `free_routing_buffers(finished_req_ids, preempted_req_ids)`.
+    0.20.2: `capturer.clear_buffer()` (full-buffer reset); `scheduler_output` is not read.
+    """
+    if USE_LEGACY_API:
+        capturer = get_capturer()
+        if capturer is not None:  # nothing to clear when no capturer is available
+            capturer.clear_buffer()
+        return
+
+    _rec.free_routing_buffers(
+        scheduler_output.finished_req_ids,
+        getattr(scheduler_output, "preempted_req_ids", None),  # None when the attribute is absent
+    )
+
+
+def issue_d2h_copy(
+    *,
+    input_batch_req_ids: list[str],
+    num_scheduled_tokens: dict[str, int],
+    positions: torch.Tensor,
+    positions_cpu: torch.Tensor | None,
+    legacy_indices: torch.Tensor | None = None,  # consumed only by the 0.20.2 branch
+) -> None:
+    """Trigger the per-step D2H copy of routed experts; every argument is keyword-only.
+
+    main: `issue_routing_d2h_copy(...)` (async copy); `legacy_indices` is not forwarded.
+    0.20.2: `capturer.save_captured_experts(indices=legacy_indices)`; other arguments are unused.
+    """
+    if USE_LEGACY_API:
+        capturer = get_capturer()
+        if capturer is not None:  # skip the copy when no capturer is available
+            capturer.save_captured_experts(indices=legacy_indices)
+        return
+
+    _rec.issue_routing_d2h_copy(
+        input_batch_req_ids=input_batch_req_ids,
+        num_scheduled_tokens=num_scheduled_tokens,
+        positions=positions,
+        positions_cpu=positions_cpu,
+    )
+
+
+def extract_for_current_batch(
+    *,
+    req_ids: list[str],
+    requests: dict,
+    req_id_to_index: dict[str, int],
+    num_tokens_no_spec: np.ndarray,
+    max_model_len: int,
+) -> dict[str, np.ndarray] | None:
+    """Pull routing data for requests finishing this step; every argument is keyword-only.
+
+    main: forwards all arguments to `extract_routed_experts_for_current_batch(...)`.
+    0.20.2: routing data flows through a different channel inside
+    `save_captured_experts`, so this returns None without reading its arguments.
+    """
+    if USE_LEGACY_API:
+        return None  # legacy path: nothing is extracted here (see docstring)
+    return _rec.extract_routed_experts_for_current_batch(
+        req_ids=req_ids,
+        requests=requests,
+        req_id_to_index=req_id_to_index,
+        num_tokens_no_spec=num_tokens_no_spec,
+        max_model_len=max_model_len,
+    )
+
+
+def call_capture(capturer, *, layer_id: int, topk_ids: torch.Tensor) -> None:
+    """Invoke `.capture(...)` on a capturer instance; a None capturer is a silent no-op.
+
+    Both 0.20.2 and main expose `capture(layer_id, topk_ids)`, so this
+    is a thin pass-through kept for symmetry with the other helpers.
+    """
+    if capturer is None:  # lets callers pass get_capturer()'s result without their own None check
+        return
+    capturer.capture(layer_id=layer_id, topk_ids=topk_ids)