From e6a001e5d9178b20d347ae0c3402bb3b1a61503d Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Mon, 11 May 2026 15:04:47 +0800
Subject: [PATCH 01/19] upgrade vllm to 0511

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .github/workflows/dockerfiles/Dockerfile.lint    | 6 ++++--
 .github/workflows/pr_test_full.yaml              | 2 +-
 .github/workflows/pr_test_light.yaml             | 6 +++---
 .github/workflows/schedule_lint_image_build.yaml | 8 --------
 docs/source/conf.py                              | 2 +-
 5 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 1cbc47ba549..bacf219e1d6 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,8 +27,10 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_TAG=v0.20.1
-RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
+ARG VLLM_COMMIT=05d610e5cdb710bffdccdda630b4eb0f79afd76d
+RUN git init /vllm-workspace/vllm && \
+    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
+    git -C /vllm-workspace/vllm checkout FETCH_HEAD
 
 # # Install vLLM common dependencies
 RUN python3 -m pip install -r /vllm-workspace/vllm/requirements/common.txt --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index b308103072f..0b5a5313418 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
+        vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 6436962fef1..b35212d5559 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: c7aa186d67b6f051680831418e957c67f34ba7a2
+      vllm: 05d610e5cdb710bffdccdda630b4eb0f79afd76d
   changes:
     runs-on: linux-aarch64-a2b3-0
     container:
@@ -155,7 +155,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
     strategy:
       matrix:
-        vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
+        vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1]
     uses: ./.github/workflows/_optional_smart_e2e.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -165,7 +165,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
+        vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_lint_image_build.yaml b/.github/workflows/schedule_lint_image_build.yaml
index bf114e46d2d..36e012ff31f 100644
--- a/.github/workflows/schedule_lint_image_build.yaml
+++ b/.github/workflows/schedule_lint_image_build.yaml
@@ -4,12 +4,6 @@ on:
     # Runs at 00:00 UTC+8 every day
     - cron: '0 20 * * *' 
   workflow_dispatch:
-    inputs:
-      vllm_hash:
-        description: 'vLLM base hash'
-        default: main
-        required: true
-        type: string
   push:
     paths:
       - '.github/workflows/dockerfiles/Dockerfile.lint'
@@ -85,5 +79,3 @@ jobs:
         labels: ${{ steps.meta.outputs.labels }}
         tags: ${{ steps.meta.outputs.tags }}
         provenance: false
-        build-args: |
-          VLLM_HASH=${{ inputs.vllm_hash }}
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6e31082413c..51a6497cb71 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -81,7 +81,7 @@
     # CANN image tag
     "cann_image_tag": "9.0.0-910b-ubuntu22.04-py3.11",
     # vLLM commit hash for main branch
-    "main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2",
+    "main_vllm_commit": "05d610e5cdb710bffdccdda630b4eb0f79afd76d",
     # vLLM tag for main branch
     "main_vllm_tag": "v0.20.1",
     # Python version for main branch

From aea68398496225eece825d2607f72d003aa32f70 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Mon, 11 May 2026 15:11:05 +0800
Subject: [PATCH 02/19] continue on error

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .github/workflows/_e2e_test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index eac9ea73826..39bf9f1966f 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -27,7 +27,7 @@ on:
       continue_on_error:
         required: false
         type: boolean
-        default: false
+        default: true
       # The following inputs are used by comment-triggered E2E tests (/e2e <tests>).
       # They carry space-separated pytest paths, categorized by runner type.
       # Leave empty (default) when running label-triggered full/light suites.

From 84552c3144d725b04d0e44e28d14cb9342d9896a Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 12 May 2026 10:02:20 +0800
Subject: [PATCH 03/19] fix

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 tests/ut/spec_decode/test_eagle_proposer.py          | 10 ++++++++--
 vllm_ascend/ops/gdn.py                               |  1 +
 vllm_ascend/patch/platform/patch_balance_schedule.py |  2 ++
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/tests/ut/spec_decode/test_eagle_proposer.py b/tests/ut/spec_decode/test_eagle_proposer.py
index fd82351b726..3fc67227bd6 100644
--- a/tests/ut/spec_decode/test_eagle_proposer.py
+++ b/tests/ut/spec_decode/test_eagle_proposer.py
@@ -1385,10 +1385,13 @@ def check_mock(self):
             "method",
             "parallel_drafting",
             "draft_tensor_parallel_size",
-            "speculative_token_tree",
             "draft_model_config",
             "disable_padded_drafter_batch",
         }
+        # speculative_token_tree was removed in newer vllm (Remove tree attention #42121);
+        # only check for it when the installed version still carries the field.
+        if "speculative_token_tree" in vllm.config.SpeculativeConfig.__dataclass_fields__:
+            fields.add("speculative_token_tree")
 
         actual = set(vllm.config.SpeculativeConfig.__dataclass_fields__)
         missing = fields - actual
@@ -2260,10 +2263,13 @@ def check_mock(self):
             "enforce_eager",
             "use_local_argmax_reduction",
             "draft_tensor_parallel_size",
-            "speculative_token_tree",
             "draft_model_config",
             "disable_padded_drafter_batch",
         }
+        # speculative_token_tree was removed in newer vllm (Remove tree attention #42121);
+        # only check for it when the installed version still carries the field.
+        if "speculative_token_tree" in vllm.config.SpeculativeConfig.__dataclass_fields__:
+            fields.add("speculative_token_tree")
         actual = set(vllm.config.SpeculativeConfig.__dataclass_fields__)
         missing = fields - actual
         assert not missing, f"Missing dataclass fields: {missing}"
diff --git a/vllm_ascend/ops/gdn.py b/vllm_ascend/ops/gdn.py
index ef061f2316e..5254d7bc27a 100644
--- a/vllm_ascend/ops/gdn.py
+++ b/vllm_ascend/ops/gdn.py
@@ -129,6 +129,7 @@ def forward(
             b,
             a,
             core_attn_out,
+            False,
             self.prefix,
         )
 
diff --git a/vllm_ascend/patch/platform/patch_balance_schedule.py b/vllm_ascend/patch/platform/patch_balance_schedule.py
index 5711352aff3..b509fa14af4 100644
--- a/vllm_ascend/patch/platform/patch_balance_schedule.py
+++ b/vllm_ascend/patch/platform/patch_balance_schedule.py
@@ -32,6 +32,7 @@ def __init__(
         kv_cache_config: KVCacheConfig,
         structured_output_manager: StructuredOutputManager,
         block_size: int,
+        hash_block_size: int | None = None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         include_finished_set: bool = False,
         log_stats: bool = False,
@@ -41,6 +42,7 @@ def __init__(
             kv_cache_config,
             structured_output_manager,
             block_size,
+            hash_block_size,
             mm_registry,
             include_finished_set,
             log_stats,

From a3296637a069d6e64eaf117010c8a218e259df4d Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Wed, 13 May 2026 11:19:03 +0800
Subject: [PATCH 04/19] fix

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .github/workflows/dockerfiles/Dockerfile.lint           | 2 +-
 .github/workflows/pr_test_full.yaml                     | 4 ++--
 .github/workflows/pr_test_light.yaml                    | 6 +++---
 .github/workflows/schedule_update_estimated_time.yaml   | 2 +-
 .github/workflows/schedule_vllm_e2e_test.yaml           | 2 +-
 Dockerfile                                              | 2 +-
 Dockerfile.310p                                         | 2 +-
 Dockerfile.310p.openEuler                               | 2 +-
 Dockerfile.a3                                           | 2 +-
 Dockerfile.a3.openEuler                                 | 2 +-
 Dockerfile.openEuler                                    | 2 +-
 docs/source/conf.py                                     | 4 ++--
 tests/e2e/multicard/2-cards/test_qwen3_moe.py           | 2 +-
 tests/e2e/singlecard/model_runner_v2/test_basic.py      | 4 ++--
 vllm_ascend/core/scheduler_profiling_chunk.py           | 4 ++--
 vllm_ascend/patch/platform/patch_mla_prefill_backend.py | 2 +-
 vllm_ascend/patch/worker/patch_v2/patch_triton.py       | 2 +-
 vllm_ascend/worker/v2/sample/logprob.py                 | 2 +-
 18 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index bacf219e1d6..a362a0b269a 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=05d610e5cdb710bffdccdda630b4eb0f79afd76d
+ARG VLLM_COMMIT=4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef
 RUN git init /vllm-workspace/vllm && \
     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
     git -C /vllm-workspace/vllm checkout FETCH_HEAD
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 0b5a5313418..b02387cdf6f 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1]
+        vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
@@ -102,7 +102,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        vllm_version: [v0.20.1]
+        vllm_version: [v0.20.2]
     needs: [parse-trigger]
     if: ${{ needs.parse-trigger.outputs.allowed == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index b35212d5559..50fe8e79f29 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 05d610e5cdb710bffdccdda630b4eb0f79afd76d
+      vllm: 4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef
   changes:
     runs-on: linux-aarch64-a2b3-0
     container:
@@ -155,7 +155,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
     strategy:
       matrix:
-        vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1]
+        vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
     uses: ./.github/workflows/_optional_smart_e2e.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -165,7 +165,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1]
+        vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_update_estimated_time.yaml b/.github/workflows/schedule_update_estimated_time.yaml
index 904472e03bb..f72befae4b8 100644
--- a/.github/workflows/schedule_update_estimated_time.yaml
+++ b/.github/workflows/schedule_update_estimated_time.yaml
@@ -23,7 +23,7 @@ jobs:
     name: e2e-test
     strategy:
       matrix:
-        vllm_version: [v0.20.1]
+        vllm_version: [v0.20.2]
         type: [full, light]
     uses: ./.github/workflows/_e2e_test.yaml
     with:
diff --git a/.github/workflows/schedule_vllm_e2e_test.yaml b/.github/workflows/schedule_vllm_e2e_test.yaml
index c5fad253cce..805553bb2c6 100644
--- a/.github/workflows/schedule_vllm_e2e_test.yaml
+++ b/.github/workflows/schedule_vllm_e2e_test.yaml
@@ -47,7 +47,7 @@ jobs:
           fail-fast: false
           matrix:
             part: [0, 1, 2, 3]
-            vllm: [v0.20.1]
+            vllm: [v0.20.2]
         container:
           image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:9.0.0-910b-ubuntu22.04-py3.11
           env:
diff --git a/Dockerfile b/Dockerfile
index 2c765c67e2c..2a8899df590 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.310p b/Dockerfile.310p
index 20383636aa2..8252a25116e 100644
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -33,7 +33,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler
index 77bc9b97103..72ef45c6f0e 100644
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -32,7 +32,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.a3 b/Dockerfile.a3
index 5ad57fe4b61..b740836546e 100644
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler
index 13f3e40f413..7167450d419 100644
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -49,7 +49,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 026f88dd4e8..da6f1c98b7e 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -49,7 +49,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.20.1
+ARG VLLM_TAG=v0.20.2
 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 51a6497cb71..16e84286b03 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -81,9 +81,9 @@
     # CANN image tag
     "cann_image_tag": "9.0.0-910b-ubuntu22.04-py3.11",
     # vLLM commit hash for main branch
-    "main_vllm_commit": "05d610e5cdb710bffdccdda630b4eb0f79afd76d",
+    "main_vllm_commit": "4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef",
     # vLLM tag for main branch
-    "main_vllm_tag": "v0.20.1",
+    "main_vllm_tag": "v0.20.2",
     # Python version for main branch
     "main_python_version": ">= 3.10, < 3.12",
     # CANN version for main branch
diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe.py b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
index 1a87a9b3a1b..7a9f7cef927 100644
--- a/tests/e2e/multicard/2-cards/test_qwen3_moe.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -75,7 +75,7 @@ def test_qwen3_moe_distributed_aiv_tp2():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
+@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [True])
 @patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})
diff --git a/tests/e2e/singlecard/model_runner_v2/test_basic.py b/tests/e2e/singlecard/model_runner_v2/test_basic.py
index baad54d8fab..c6b4f245a04 100644
--- a/tests/e2e/singlecard/model_runner_v2/test_basic.py
+++ b/tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -65,7 +65,7 @@ def test_qwen3_dense_eager_mode(
         runner.model.generate(prompts, sampling_params)
 
 
-@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
+@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
 @pytest.mark.parametrize("model", MAIN_MODELS)
 @pytest.mark.parametrize("eagle_model", EGALE_MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
@@ -104,7 +104,7 @@ def test_egale_spec_decoding(
         runner.model.generate(prompts, sampling_params)
 
 
-@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
+@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("enforce_eager", [False])
diff --git a/vllm_ascend/core/scheduler_profiling_chunk.py b/vllm_ascend/core/scheduler_profiling_chunk.py
index b79aae83ff0..3545fbf9224 100644
--- a/vllm_ascend/core/scheduler_profiling_chunk.py
+++ b/vllm_ascend/core/scheduler_profiling_chunk.py
@@ -577,7 +577,7 @@ def schedule(self) -> SchedulerOutput:  # noqa: C901
                     num_encoder_tokens = sum(request.get_num_encoder_embeds(i) for i in encoder_inputs_to_schedule)
 
                 if (
-                    vllm_version_is("0.20.1")
+                    vllm_version_is("0.20.2")
                     and self.scheduler_reserve_full_isl
                     and not self.kv_cache_manager.can_fit_full_sequence(
                         request,
@@ -601,7 +601,7 @@ def schedule(self) -> SchedulerOutput:  # noqa: C901
                     delay_cache_blocks=load_kv_async,
                     num_encoder_tokens=num_encoder_tokens,
                     **(
-                        {} if vllm_version_is("0.20.1") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl}
+                        {} if vllm_version_is("0.20.2") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl}
                     ),
                 )
 
diff --git a/vllm_ascend/patch/platform/patch_mla_prefill_backend.py b/vllm_ascend/patch/platform/patch_mla_prefill_backend.py
index c904575cb3f..75615ed9a43 100644
--- a/vllm_ascend/patch/platform/patch_mla_prefill_backend.py
+++ b/vllm_ascend/patch/platform/patch_mla_prefill_backend.py
@@ -17,7 +17,7 @@
 
 from vllm_ascend.utils import vllm_version_is
 
-if not vllm_version_is("0.20.1"):
+if not vllm_version_is("0.20.2"):
     from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend
 
     class AscendMLAPrefillBackend(MLAPrefillBackend):
diff --git a/vllm_ascend/patch/worker/patch_v2/patch_triton.py b/vllm_ascend/patch/worker/patch_v2/patch_triton.py
index a910846d3a0..420b5b56298 100644
--- a/vllm_ascend/patch/worker/patch_v2/patch_triton.py
+++ b/vllm_ascend/patch/worker/patch_v2/patch_triton.py
@@ -29,7 +29,7 @@
 logprob.compute_token_logprobs = compute_token_logprobs
 structured_outputs._apply_grammar_bitmask_kernel = _apply_grammar_bitmask_kernel
 
-if not vllm_version_is("0.20.1"):
+if not vllm_version_is("0.20.2"):
     from vllm_ascend.worker.v2.spec_decode.probabilistic_rejection_sampler_utils import (
         probabilistic_rejection_sample as npu_probabilistic_rejection_sample,
     )
diff --git a/vllm_ascend/worker/v2/sample/logprob.py b/vllm_ascend/worker/v2/sample/logprob.py
index e2fabd5fad1..9d275fe5776 100644
--- a/vllm_ascend/worker/v2/sample/logprob.py
+++ b/vllm_ascend/worker/v2/sample/logprob.py
@@ -24,7 +24,7 @@
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
 from vllm_ascend.utils import vllm_version_is
 
-if not vllm_version_is("0.20.1"):
+if not vllm_version_is("0.20.2"):
     from vllm.v1.worker.gpu.sample.logprob import LogprobTokenIdsState
 
 

From b717ac5e2215e3e22570271c6a59f168d6fd69b6 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 12 May 2026 19:21:37 +0800
Subject: [PATCH 05/19] fix weight load error

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/patch/worker/patch_deepseek_mtp.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/patch/worker/patch_deepseek_mtp.py b/vllm_ascend/patch/worker/patch_deepseek_mtp.py
index bc147d74858..ec763569ec0 100644
--- a/vllm_ascend/patch/worker/patch_deepseek_mtp.py
+++ b/vllm_ascend/patch/worker/patch_deepseek_mtp.py
@@ -12,7 +12,11 @@ def get_spec_layer_idx_from_weight_name(config: DeepseekV2Config | DeepseekV3Con
     if hasattr(config, "num_nextn_predict_layers") and config.num_nextn_predict_layers > 0:
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
-            if weight_name.startswith(f"model.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME):
+            if (
+                weight_name.startswith(f"models.layers.{layer_idx + i}.")
+                or weight_name.startswith(MTP_ROT_WEIGHT_NAME)
+                or weight_name.startswith(f"layers.{layer_idx + i}.")
+            ):
                 return layer_idx + i
     return None
 

From 018d75b9456299635b4cde4a5abede768ced020f Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 12 May 2026 19:38:55 +0800
Subject: [PATCH 06/19] fix

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .../patch/worker/patch_deepseek_mtp.py        | 24 +++++++++++++++++--
 vllm_ascend/worker/v2/block_table.py          |  4 ++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/patch/worker/patch_deepseek_mtp.py b/vllm_ascend/patch/worker/patch_deepseek_mtp.py
index ec763569ec0..bce4e826efd 100644
--- a/vllm_ascend/patch/worker/patch_deepseek_mtp.py
+++ b/vllm_ascend/patch/worker/patch_deepseek_mtp.py
@@ -5,6 +5,8 @@
 from vllm.config import VllmConfig
 from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP, DeepSeekMultiTokenPredictorLayer
 
+from vllm_ascend.utils import vllm_version_is
+
 MTP_ROT_WEIGHT_NAME = "rot.weight"
 
 
@@ -21,6 +23,17 @@ def get_spec_layer_idx_from_weight_name(config: DeepseekV2Config | DeepseekV3Con
     return None
 
 
+def get_spec_layer_idx_from_weight_name_020(
+    config: DeepseekV2Config | DeepseekV3Config, weight_name: str
+) -> int | None:
+    if hasattr(config, "num_nextn_predict_layers") and config.num_nextn_predict_layers > 0:
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_nextn_predict_layers):
+            if weight_name.startswith(f"models.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME):
+                return layer_idx + i
+    return None
+
+
 class AscendDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
     def __init__(self, vllm_config: VllmConfig, prefix: str) -> None:
         super().__init__(vllm_config, prefix)
@@ -61,7 +74,14 @@ def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:
             return f"model.layers.{spec_layer}.rot.weight"
 
 
-vllm.model_executor.models.deepseek_v2.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name
-vllm.model_executor.models.deepseek_mtp.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name
+if vllm_version_is("0.20.2"):
+    vllm.model_executor.models.deepseek_v2.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name_020
+    vllm.model_executor.models.deepseek_mtp.get_spec_layer_idx_from_weight_name = (
+        get_spec_layer_idx_from_weight_name_020
+    )
+else:
+    vllm.model_executor.models.deepseek_v2.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name
+    vllm.model_executor.models.deepseek_mtp.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name
+
 vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer = AscendDeepSeekMultiTokenPredictorLayer
 vllm.model_executor.models.deepseek_mtp.DeepSeekMTP = AscendDeepSeekMTP
diff --git a/vllm_ascend/worker/v2/block_table.py b/vllm_ascend/worker/v2/block_table.py
index 8ce0d294b1a..94c7dd73dca 100644
--- a/vllm_ascend/worker/v2/block_table.py
+++ b/vllm_ascend/worker/v2/block_table.py
@@ -31,7 +31,7 @@ def __init__(
         block_sizes: list[int],
         max_num_reqs: int,
         max_num_batched_tokens: int,
-        max_model_len: int,
+        max_num_blocks_per_group: list[int],
         device: torch.device,
         cp_size: int = 1,
         cp_rank: int = 0,
@@ -41,7 +41,7 @@ def __init__(
             block_sizes,
             max_num_reqs,
             max_num_batched_tokens,
-            max_model_len,
+            max_num_blocks_per_group,
             device,
             cp_size,
             cp_rank,

From f68a9fb2ef4a0b3a66739a3c077f27ff97f873a9 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 12 May 2026 20:03:58 +0800
Subject: [PATCH 07/19] adapt mdrnV2

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/worker/v2/attn_utils.py           | 29 ++++++++++++++++---
 vllm_ascend/worker/v2/model_runner.py         | 14 ++++++---
 vllm_ascend/worker/v2/model_states/default.py |  1 +
 3 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py
index 71c3efccae4..fbc4c68794a 100644
--- a/vllm_ascend/worker/v2/attn_utils.py
+++ b/vllm_ascend/worker/v2/attn_utils.py
@@ -34,6 +34,7 @@
     MLAAttentionSpec,
     UniformTypeKVCacheSpecs,
 )
+from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata
 from vllm.v1.worker.utils import AttentionGroup
 
 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
@@ -75,6 +76,8 @@ def build_attn_metadata(
     graph_pad_size: int = -1,
     num_input_tokens: int = 0,
     prefill_context_parallel_metadata: AscendPrefillContextParallelMetadata | None = None,
+    model_specific_attn_metadata: ModelSpecificAttnMetadata | None = None,
+    for_cudagraph_capture: bool = False,
 ) -> dict[str, Any]:
     """Build attention metadata for Ascend NPUs."""
     # TODO(Ronald1995): optimize AscendCommonAttentionMetadata.
@@ -92,6 +95,11 @@ def build_attn_metadata(
         block_table = block_tables[i]
         slot_mapping = slot_mappings[i]
 
+        common_attn_metadata_extra_kwargs = (
+            model_specific_attn_metadata.get_extra_common_attn_kwargs(i, num_reqs)
+            if model_specific_attn_metadata is not None
+            else {}
+        )
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=query_start_loc_gpu,
             query_start_loc_cpu=query_start_loc_cpu,
@@ -109,14 +117,27 @@ def build_attn_metadata(
             num_input_tokens=num_input_tokens,
             prefill_context_parallel_metadata=prefill_context_parallel_metadata,
             max_seq_len=max_seq_len,
+            **common_attn_metadata_extra_kwargs,
         )
 
         for attn_group in attn_groups[i]:
             attn_metadata_builder = attn_group.get_metadata_builder(0)
-            metadata = attn_metadata_builder.build(
-                common_prefix_len=0,
-                common_attn_metadata=common_attn_metadata,
-            )
+            if for_cudagraph_capture:
+                metadata = attn_metadata_builder.build_for_cudagraph_capture(common_attn_metadata)
+            else:
+                attn_metadata_extra_kwargs = (
+                    model_specific_attn_metadata.get_extra_attn_kwargs(
+                        attn_metadata_builder,
+                        num_reqs,
+                    )
+                    if model_specific_attn_metadata is not None
+                    else {}
+                )
+                metadata = attn_metadata_builder.build(
+                    common_prefix_len=0,
+                    common_attn_metadata=common_attn_metadata,
+                    **attn_metadata_extra_kwargs,
+                )
             for layer_name in attn_group.layer_names:
                 attn_metadata[layer_name] = metadata
     return attn_metadata
diff --git a/vllm_ascend/worker/v2/model_runner.py b/vllm_ascend/worker/v2/model_runner.py
index a2903dca9c4..f09be6a5e63 100644
--- a/vllm_ascend/worker/v2/model_runner.py
+++ b/vllm_ascend/worker/v2/model_runner.py
@@ -208,6 +208,7 @@ def prepare_inputs(
 
         # Get the number of draft tokens for each request.
         draft_tokens = scheduler_output.scheduled_spec_decode_tokens
+        num_draft_tokens_per_req: np.ndarray | None = None
         if not draft_tokens:
             # No draft token scheduled (common case).
             total_num_draft_tokens = 0
@@ -217,14 +218,14 @@ def prepare_inputs(
             expanded_idx_mapping = idx_mapping
             expanded_local_pos = torch.zeros(num_reqs, dtype=torch.int32, device=self.device)
         else:
-            num_draft_tokens = np.array(
+            num_draft_tokens_per_req = np.array(
                 [len(draft_tokens.get(req_id, ())) for req_id in req_ids],
                 dtype=np.int32,
             )
-            total_num_draft_tokens = int(num_draft_tokens.sum())
+            total_num_draft_tokens = int(num_draft_tokens_per_req.sum())
             total_num_logits = num_reqs + total_num_draft_tokens
 
-            num_logits = num_draft_tokens + 1
+            num_logits = num_draft_tokens_per_req + 1
             cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32)
             cu_num_logits_np[0] = 0
             np.cumsum(num_logits, out=cu_num_logits_np[1:])
@@ -261,9 +262,12 @@ def prepare_inputs(
 
         query_start_loc_np = query_start_loc_np[: num_reqs_padded + 1]
         query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
+        is_prefilling_np = (
+            self.req_states.num_computed_prefill_tokens[idx_mapping_np] < self.req_states.prefill_len.np[idx_mapping_np]
+        )
 
         # Get prefill tokens if any.
-        if self.req_states.any_prefills(idx_mapping_np):
+        if np.any(is_prefilling_np):
             prepare_prefill_inputs(
                 self.input_buffers.input_ids,
                 self.req_states.next_prefill_tokens,
@@ -326,11 +330,13 @@ def prepare_inputs(
             num_tokens=num_tokens,
             num_tokens_after_padding=num_tokens_after_padding,
             num_draft_tokens=total_num_draft_tokens,
+            num_draft_tokens_per_req=num_draft_tokens_per_req,
             query_start_loc=query_start_loc,
             query_start_loc_np=query_start_loc_np,
             seq_lens=seq_lens,
             seq_lens_cpu_upper_bound=seq_lens_cpu_upper_bound,
             dcp_local_seq_lens=None,  # TODO(Ronald1995): support cp.
+            is_prefilling_np=is_prefilling_np,
             input_ids=input_ids,
             positions=positions,
             logits_indices=logits_indices,
diff --git a/vllm_ascend/worker/v2/model_states/default.py b/vllm_ascend/worker/v2/model_states/default.py
index 90546de5cf1..9dd9d888ead 100644
--- a/vllm_ascend/worker/v2/model_states/default.py
+++ b/vllm_ascend/worker/v2/model_states/default.py
@@ -72,5 +72,6 @@ def prepare_attn(
             seq_lens_np=input_batch.seq_lens_np,
             positions=input_batch.positions,
             attn_state=input_batch.attn_state,
+            for_cudagraph_capture=for_capture,
         )
         return self.attn_metadata

From 72d571e9b1fd1998838ffb321c7adaa9a05cb8b3 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 12 May 2026 23:40:24 +0800
Subject: [PATCH 08/19] fix patch

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/patch/worker/__init__.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index 38dd6fcde8e..d5827935510 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -17,14 +17,13 @@
 
 from vllm.triton_utils import HAS_TRITON
 
-from vllm_ascend.utils import is_310p
+from vllm_ascend.utils import is_310p, vllm_version_is
 
 if HAS_TRITON:
     import vllm_ascend.patch.worker.patch_triton
     import vllm_ascend.patch.worker.patch_v2.patch_triton  # noqa
 
 
-# isort: off
 import vllm_ascend.patch.worker.patch_weight_utils  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_bert  # noqa
@@ -53,4 +52,6 @@
 import vllm_ascend.patch.worker.patch_v2.patch_model_state  # noqa
 import vllm_ascend.patch.worker.patch_v2.patch_block_table  # noqa
 import vllm_ascend.patch.worker.patch_gqa_c8  # noqa
-import vllm_ascend.patch.worker.patch_v2.patch_attn_utils  # noqa
+
+if not vllm_version_is("0.20.2"):
+    import vllm_ascend.patch.worker.patch_v2.patch_attn_utils  # noqa

From d3a30f6ea5571d06a92fd32bffed94ade16009fe Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 12 May 2026 23:39:01 +0800
Subject: [PATCH 09/19] fix

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/worker/model_runner_v1.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 316f0a06047..ac9dfd0278d 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2273,8 +2273,11 @@ def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int:
             return round_up(num_scheduled_tokens, tp_size)
         return num_scheduled_tokens
 
-    # This is a function from the upstream vllm used to handle PP+SP. Since the judgment logic 
-    # of flashcomm1 in Ascend is inconsistent with SP in vllm, it needs to be overridden.
+    # These functions from upstream vllm handle PP+SP. Ascend's flashcomm1 SP
+    # differs from vllm's native SP: flashcomm1 does NOT scatter the residual
+    # before PP send, so the all_gather in sync_and_gather_intermediate_tensors
+    # must be skipped. Both overrides use enable_sp() rather than
+    # is_residual_scattered_for_sp() to reflect the actual Ascend SP state.
     def sync_and_slice_intermediate_tensors(
         self,
         num_tokens: int,
@@ -2284,8 +2287,6 @@ def sync_and_slice_intermediate_tensors(
         assert self.intermediate_tensors is not None
         tp = self.vllm_config.parallel_config.tensor_parallel_size
 
-        # When sequence parallelism is enabled, the "residual" tensor is sharded
-        # across tensor parallel ranks, so each rank only needs its own slice.
         if sync_self:
             assert intermediate_tensors is not None
             for k, v in intermediate_tensors.items():
@@ -2303,6 +2304,19 @@ def sync_and_slice_intermediate_tensors(
             }
         )
 
+    def sync_and_gather_intermediate_tensors(
+        self,
+        num_tokens: int,
+        intermediate_tensors: IntermediateTensors | None,
+        sync_self: bool,
+    ) -> IntermediateTensors:
+        # vllm renamed sync_and_slice to sync_and_gather in v0.20.2.
+        # The Ascend override logic is identical: skip the upstream all_gather
+        # (flashcomm1 does not scatter residual before PP send).
+        return self.sync_and_slice_intermediate_tensors(
+            num_tokens, intermediate_tensors, sync_self
+        )
+
     def _determine_batch_execution_and_padding(
         self,
         num_tokens: int,

From 86dac634b418121fda4727104fc6ede03751970a Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Wed, 13 May 2026 10:34:05 +0800
Subject: [PATCH 10/19] fix gdn

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/ops/gdn.py | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/vllm_ascend/ops/gdn.py b/vllm_ascend/ops/gdn.py
index 5254d7bc27a..10dfc6495a0 100644
--- a/vllm_ascend/ops/gdn.py
+++ b/vllm_ascend/ops/gdn.py
@@ -32,6 +32,7 @@
 from vllm_ascend.ops.triton.fla.utils import clear_ssm_states
 from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch
 from vllm_ascend.ops.triton.mamba.causal_conv1d import causal_conv1d_update_npu
+from vllm_ascend.utils import vllm_version_is
 
 
 def to_int64_tuple(tensor: torch.Tensor) -> tuple[int, ...]:
@@ -124,14 +125,23 @@ def forward(
             device=hidden_states.device,
         )
 
-        torch.ops.vllm.gdn_attention_core(
-            mixed_qkv,
-            b,
-            a,
-            core_attn_out,
-            False,
-            self.prefix,
-        )
+        if vllm_version_is("0.20.2"):
+            torch.ops.vllm.gdn_attention_core(
+                mixed_qkv,
+                b,
+                a,
+                core_attn_out,
+                self.prefix,
+            )
+        else:
+            torch.ops.vllm.gdn_attention_core(
+                mixed_qkv,
+                b,
+                a,
+                core_attn_out,
+                False,
+                self.prefix,
+            )
 
         # ============================================================
         # Part 3: Output Projection

From a4cee58053b3224d19ad3d217ded25632952d345 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Wed, 13 May 2026 11:18:07 +0800
Subject: [PATCH 11/19] fix patch

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/patch/worker/patch_deepseek_mtp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_ascend/patch/worker/patch_deepseek_mtp.py b/vllm_ascend/patch/worker/patch_deepseek_mtp.py
index bce4e826efd..696d3e55127 100644
--- a/vllm_ascend/patch/worker/patch_deepseek_mtp.py
+++ b/vllm_ascend/patch/worker/patch_deepseek_mtp.py
@@ -15,7 +15,7 @@ def get_spec_layer_idx_from_weight_name(config: DeepseekV2Config | DeepseekV3Con
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
             if (
-                weight_name.startswith(f"models.layers.{layer_idx + i}.")
+                weight_name.startswith(f"model.layers.{layer_idx + i}.")
                 or weight_name.startswith(MTP_ROT_WEIGHT_NAME)
                 or weight_name.startswith(f"layers.{layer_idx + i}.")
             ):
@@ -29,7 +29,7 @@ def get_spec_layer_idx_from_weight_name_020(
     if hasattr(config, "num_nextn_predict_layers") and config.num_nextn_predict_layers > 0:
         layer_idx = config.num_hidden_layers
         for i in range(config.num_nextn_predict_layers):
-            if weight_name.startswith(f"models.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME):
                 return layer_idx + i
     return None
 

From b95534af97f9bb1bc25d27c23c949e881bca5dc0 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Wed, 13 May 2026 11:32:00 +0800
Subject: [PATCH 12/19] fix lint

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/worker/v2/model_runner.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm_ascend/worker/v2/model_runner.py b/vllm_ascend/worker/v2/model_runner.py
index f09be6a5e63..2236b76db8f 100644
--- a/vllm_ascend/worker/v2/model_runner.py
+++ b/vllm_ascend/worker/v2/model_runner.py
@@ -218,14 +218,15 @@ def prepare_inputs(
             expanded_idx_mapping = idx_mapping
             expanded_local_pos = torch.zeros(num_reqs, dtype=torch.int32, device=self.device)
         else:
-            num_draft_tokens_per_req = np.array(
+            num_draft_tokens_arr = np.array(
                 [len(draft_tokens.get(req_id, ())) for req_id in req_ids],
                 dtype=np.int32,
             )
-            total_num_draft_tokens = int(num_draft_tokens_per_req.sum())
+            num_draft_tokens_per_req = num_draft_tokens_arr
+            total_num_draft_tokens = int(num_draft_tokens_arr.sum())
             total_num_logits = num_reqs + total_num_draft_tokens
 
-            num_logits = num_draft_tokens_per_req + 1
+            num_logits = num_draft_tokens_arr + 1
             cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32)
             cu_num_logits_np[0] = 0
             np.cumsum(num_logits, out=cu_num_logits_np[1:])

From a0b93d5d21d919abd702726133a38de863c9fb37 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Wed, 13 May 2026 14:49:37 +0800
Subject: [PATCH 13/19] disable async schedule when
 enable_return_routed_experts

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py b/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py
index 97f90698bc0..b22d3504a10 100644
--- a/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py
@@ -19,6 +19,7 @@ def test_qwen3_moe_routing_replay():
         cudagraph_capture_sizes=[1, 2, 4, 8],
         distributed_executor_backend="mp",
         enable_return_routed_experts=True,
+        async_scheduling=False,
     ) as vllm_model:
         sampling_params = SamplingParams(
             max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY

From 0396b2f9c5f891182f0cde9fe764201535df75c5 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Wed, 13 May 2026 15:30:06 +0800
Subject: [PATCH 14/19] async D2H pipeline for routed expert replay

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/ops/fused_moe/fused_moe.py        |  14 +-
 .../ops/fused_moe/routed_experts_compat.py    | 126 ++++++++++++++++++
 vllm_ascend/worker/model_runner_v1.py         |  29 ++--
 3 files changed, 151 insertions(+), 18 deletions(-)
 create mode 100644 vllm_ascend/ops/fused_moe/routed_experts_compat.py

diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 4f1c5eebcec..46730a98366 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -27,7 +27,6 @@
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
-from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
 from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner  # type: ignore
 
 import vllm_ascend.envs as envs_ascend
@@ -36,6 +35,7 @@
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
 from vllm_ascend.flash_common3_context import get_flash_common3_context, set_flash_common3_context
+from vllm_ascend.ops.fused_moe import routed_experts_compat
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts, zero_experts_compute
 from vllm_ascend.ops.fused_moe.moe_comm_method import AllGatherCommImpl, FusedExpertsResult, setup_moe_comm_method
 from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
@@ -159,12 +159,12 @@ def apply(
             num_experts=num_logical_experts,
         )
         if layer.vllm_config.model_config is not None and layer.vllm_config.model_config.enable_return_routed_experts:
-            capturer = RoutedExpertsCapturer.get_instance()
-            if capturer is not None:
-                capturer.capture(
-                    layer_id=layer.layer_id,
-                    topk_ids=topk_ids,
-                )
+            capturer = routed_experts_compat.get_capturer()
+            routed_experts_compat.call_capture(
+                capturer,
+                layer_id=layer.layer_id,
+                topk_ids=topk_ids,
+            )
 
         if zero_expert_num > 0 and zero_expert_type is not None:
             topk_ids, topk_weights, zero_expert_result = zero_experts_compute(
diff --git a/vllm_ascend/ops/fused_moe/routed_experts_compat.py b/vllm_ascend/ops/fused_moe/routed_experts_compat.py
new file mode 100644
index 00000000000..7f103134749
--- /dev/null
+++ b/vllm_ascend/ops/fused_moe/routed_experts_compat.py
@@ -0,0 +1,126 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+"""Compatibility shim around vLLM's RoutedExpertsCapturer.
+
+vLLM PR #39917 (post-0.20.2) reworked the routed-experts pipeline:
+- 0.20.2 exposed `RoutedExpertsCapturer.get_instance()` plus
+  `clear_buffer()` / `save_captured_experts(indices=...)` methods.
+- main moved to module-level helpers (`get_global_experts_capturer`,
+  `issue_routing_d2h_copy`, `extract_routed_experts_for_current_batch`,
+  `free_routing_buffers`, `init_routed_experts_capturer_with_shared_cache`).
+
+We support exactly two vLLM versions: 0.20.2 (old API) and main (new
+API). Selection is via `vllm_version_is("0.20.2")` -- explicit, no
+runtime symbol probing.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import torch
+from vllm.model_executor.layers.fused_moe import routed_experts_capturer as _rec
+
+from vllm_ascend.utils import vllm_version_is
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+
+USE_LEGACY_API = vllm_version_is("0.20.2")
+
+
+def get_capturer():
+    """Return the global capturer instance, or None if not initialized."""
+    if USE_LEGACY_API:
+        return _rec.RoutedExpertsCapturer.get_instance()
+    return _rec.get_global_experts_capturer()
+
+
+def clear_step_buffers(scheduler_output: SchedulerOutput) -> None:
+    """Free per-request routing buffers for finished/preempted reqs.
+
+    main: `free_routing_buffers(finished, preempted)`.
+    0.20.2: `capturer.clear_buffer()` (full-buffer reset).
+    """
+    if USE_LEGACY_API:
+        capturer = get_capturer()
+        if capturer is not None:
+            capturer.clear_buffer()
+        return
+
+    _rec.free_routing_buffers(
+        scheduler_output.finished_req_ids,
+        getattr(scheduler_output, "preempted_req_ids", None),
+    )
+
+
+def issue_d2h_copy(
+    *,
+    input_batch_req_ids: list[str],
+    num_scheduled_tokens: dict[str, int],
+    positions: torch.Tensor,
+    positions_cpu: torch.Tensor | None,
+    legacy_indices: torch.Tensor | None = None,
+) -> None:
+    """Trigger the per-step D2H copy of routed experts.
+
+    main: `issue_routing_d2h_copy(...)` (async copy).
+    0.20.2: `capturer.save_captured_experts(indices=legacy_indices)`.
+    """
+    if USE_LEGACY_API:
+        capturer = get_capturer()
+        if capturer is not None:
+            capturer.save_captured_experts(indices=legacy_indices)
+        return
+
+    _rec.issue_routing_d2h_copy(
+        input_batch_req_ids=input_batch_req_ids,
+        num_scheduled_tokens=num_scheduled_tokens,
+        positions=positions,
+        positions_cpu=positions_cpu,
+    )
+
+
+def extract_for_current_batch(
+    *,
+    req_ids: list[str],
+    requests: dict,
+    req_id_to_index: dict[str, int],
+    num_tokens_no_spec: np.ndarray,
+    max_model_len: int,
+) -> dict[str, np.ndarray] | None:
+    """Pull routing data for requests finishing this step.
+
+    main: `extract_routed_experts_for_current_batch(...)`.
+    0.20.2: routing data flows through a different channel inside
+    `save_captured_experts`, so this returns None.
+    """
+    if USE_LEGACY_API:
+        return None
+    return _rec.extract_routed_experts_for_current_batch(
+        req_ids=req_ids,
+        requests=requests,
+        req_id_to_index=req_id_to_index,
+        num_tokens_no_spec=num_tokens_no_spec,
+        max_model_len=max_model_len,
+    )
+
+
+def call_capture(capturer, *, layer_id: int, topk_ids: torch.Tensor) -> None:
+    """Invoke `.capture(...)` on a capturer instance.
+
+    Both 0.20.2 and main expose `capture(layer_id, topk_ids)`, so this
+    is a thin pass-through kept for symmetry with the other helpers.
+    """
+    if capturer is None:
+        return
+    capturer.capture(layer_id=layer_id, topk_ids=topk_ids)
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index ac9dfd0278d..13d42c546da 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -150,7 +150,7 @@
     set_mc2_mask,
     set_mc2_tokens_capacity,
 )
-from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
+from vllm_ascend.ops.fused_moe import routed_experts_compat
 
 if TYPE_CHECKING:
     import xgrammar as xgr  # type: ignore[import-untyped]
@@ -1498,11 +1498,7 @@ def execute_model(
         intermediate_tensors: IntermediateTensors | None = None,
     ) -> ModelRunnerOutput | IntermediateTensors | None:
         if self.vllm_config.model_config.enable_return_routed_experts:
-            capturer = RoutedExpertsCapturer.get_instance()
-            if capturer is not None:
-                capturer.clear_buffer()
-            else:
-                logger.warning("RoutedExpertsCapturer is not initialized.")
+            routed_experts_compat.clear_step_buffers(scheduler_output)
 
         if self.ascend_config.profiling_chunk_config.need_timing:
             # Check if the scheduler signaled that calibration is complete.
@@ -1968,12 +1964,22 @@ def propose_draft_token_ids(sampled_token_ids):
             if self.speculative_config is not None:
                 self.finalize_kv_connector()
 
+        routed_experts_dict = None
         if self.model_config.enable_return_routed_experts:
-            capturer = RoutedExpertsCapturer.get_instance()
-            if capturer is not None:
-                capturer.save_captured_experts(indices=self.cpu_slot_mapping)
-            else:
-                logger.warning("RoutedExpertsCapturer is not initialized.")
+            routed_experts_compat.issue_d2h_copy(
+                input_batch_req_ids=self.input_batch.req_ids,
+                num_scheduled_tokens=scheduler_output.num_scheduled_tokens,
+                positions=self.positions,
+                positions_cpu=getattr(self, "_positions_cpu", None),
+                legacy_indices=self.cpu_slot_mapping,
+            )
+            routed_experts_dict = routed_experts_compat.extract_for_current_batch(
+                req_ids=req_ids_output_copy,
+                requests=self.requests,
+                req_id_to_index=self.input_batch.req_id_to_index,
+                num_tokens_no_spec=self.input_batch.num_tokens_no_spec,
+                max_model_len=self.max_model_len,
+            )
 
         model_runner_output = ModelRunnerOutput(
             req_ids=req_ids_output_copy,
@@ -1985,6 +1991,7 @@ def propose_draft_token_ids(sampled_token_ids):
             pooler_output=[],
             ec_connector_output=ec_connector_output if self.supports_mm_inputs else None,
             cudagraph_stats=cudagraph_stats,
+            routed_experts_dict=routed_experts_dict,
         )
         if self.ascend_config.profiling_chunk_config.need_timing and hasattr(self, '_execution_start_time'):
             self._sync_device()

From 0ac534a80062348822f01ad83b60256aec7d54aa Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Thu, 14 May 2026 09:16:20 +0800
Subject: [PATCH 15/19] adapt for 0202

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/worker/model_runner_v1.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 13d42c546da..4fe84c6f4a8 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -138,6 +138,7 @@
     lmhead_tp_enable,
     set_weight_prefetch_method,
     should_skip_allreduce_across_dp_group,
+    vllm_version_is,
 )
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager
@@ -1991,7 +1992,9 @@ def propose_draft_token_ids(sampled_token_ids):
             pooler_output=[],
             ec_connector_output=ec_connector_output if self.supports_mm_inputs else None,
             cudagraph_stats=cudagraph_stats,
-            routed_experts_dict=routed_experts_dict,
+            **(
+                {} if vllm_version_is("0.20.2") else {"routed_experts_dict": routed_experts_dict}
+            ),
         )
         if self.ascend_config.profiling_chunk_config.need_timing and hasattr(self, '_execution_start_time'):
             self._sync_device()

From 541a61cc20658932b7465346bf4074865b6446d7 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Thu, 14 May 2026 11:49:41 +0800
Subject: [PATCH 16/19] tiny fix

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/ops/fused_moe/routed_experts_compat.py | 6 ------
 vllm_ascend/worker/v2/attn_utils.py                | 6 ++++--
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/vllm_ascend/ops/fused_moe/routed_experts_compat.py b/vllm_ascend/ops/fused_moe/routed_experts_compat.py
index 7f103134749..2a01033b3d7 100644
--- a/vllm_ascend/ops/fused_moe/routed_experts_compat.py
+++ b/vllm_ascend/ops/fused_moe/routed_experts_compat.py
@@ -9,17 +9,11 @@
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 """Compatibility shim around vLLM's RoutedExpertsCapturer.
-
-vLLM PR #39917 (post-0.20.2) reworked the routed-experts pipeline:
 - 0.20.2 exposed `RoutedExpertsCapturer.get_instance()` plus
   `clear_buffer()` / `save_captured_experts(indices=...)` methods.
 - main moved to module-level helpers (`get_global_experts_capturer`,
   `issue_routing_d2h_copy`, `extract_routed_experts_for_current_batch`,
   `free_routing_buffers`, `init_routed_experts_capturer_with_shared_cache`).
-
-We support exactly two vLLM versions: 0.20.2 (old API) and main (new
-API). Selection is via `vllm_version_is("0.20.2")` -- explicit, no
-runtime symbol probing.
 """
 
 from __future__ import annotations
diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py
index fbc4c68794a..ceda15a074c 100644
--- a/vllm_ascend/worker/v2/attn_utils.py
+++ b/vllm_ascend/worker/v2/attn_utils.py
@@ -34,14 +34,16 @@
     MLAAttentionSpec,
     UniformTypeKVCacheSpecs,
 )
-from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata
 from vllm.v1.worker.utils import AttentionGroup
 
 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, AscendPrefillContextParallelMetadata
 from vllm_ascend.quantization.utils import enable_fa_quant
-from vllm_ascend.utils import calc_split_factor
+from vllm_ascend.utils import calc_split_factor, vllm_version_is
+
+if vllm_version_is("0.20.2"):
+    from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata
 
 _ATTENTION_MASK_BUILDER = None
 

From 8ae2bfc8a3449a63b6b7f40a4b34d5a8959150b2 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Thu, 14 May 2026 12:35:45 +0800
Subject: [PATCH 17/19] fix

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/worker/v2/attn_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py
index ceda15a074c..d2fd0f2e0d0 100644
--- a/vllm_ascend/worker/v2/attn_utils.py
+++ b/vllm_ascend/worker/v2/attn_utils.py
@@ -42,7 +42,7 @@
 from vllm_ascend.quantization.utils import enable_fa_quant
 from vllm_ascend.utils import calc_split_factor, vllm_version_is
 
-if vllm_version_is("0.20.2"):
+if not vllm_version_is("0.20.2"):
     from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata
 
 _ATTENTION_MASK_BUILDER = None

From 9c4576e235f5e503d56ce4b5c7e917324bfa0205 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Thu, 14 May 2026 12:36:38 +0800
Subject: [PATCH 18/19] fix

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .github/workflows/_e2e_test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 39bf9f1966f..eac9ea73826 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -27,7 +27,7 @@ on:
       continue_on_error:
         required: false
         type: boolean
-        default: true
+        default: false
       # The following inputs are used by comment-triggered E2E tests (/e2e <tests>).
       # They carry space-separated pytest paths, categorized by runner type.
       # Leave empty (default) when running label-triggered full/light suites.

From 6cdb1aded3b7898f84ae5054a8803efb606e6e5d Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Thu, 14 May 2026 15:14:15 +0800
Subject: [PATCH 19/19] remove mdrnv2 for v0.20.2

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 vllm_ascend/patch/worker/__init__.py    | 17 +++++++++++------
 vllm_ascend/worker/v2/attn_utils.py     |  6 ++----
 vllm_ascend/worker/v2/sample/logprob.py |  7 ++-----
 vllm_ascend/worker/worker.py            |  4 ++++
 4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index d5827935510..6dab60c7701 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -19,9 +19,14 @@
 
 from vllm_ascend.utils import is_310p, vllm_version_is
 
+# v2 model runner is only supported on vllm > 0.20.2.
+_V2_MODEL_RUNNER_SUPPORTED = not vllm_version_is("0.20.2")
+
 if HAS_TRITON:
     import vllm_ascend.patch.worker.patch_triton
-    import vllm_ascend.patch.worker.patch_v2.patch_triton  # noqa
+
+    if _V2_MODEL_RUNNER_SUPPORTED:
+        import vllm_ascend.patch.worker.patch_v2.patch_triton  # noqa
 
 
 import vllm_ascend.patch.worker.patch_weight_utils  # noqa
@@ -41,17 +46,17 @@
 else:
     import vllm_ascend.patch.worker.patch_idex_310  # noqa
 import vllm_ascend.patch.worker.patch_rejection_sampler  # noqa
-import vllm_ascend.patch.worker.patch_v2.patch_uva  # noqa
 import vllm_ascend.patch.worker.patch_huanyuan_vl  # noqa
 import vllm_ascend.patch.worker.patch_npugraph_ex_triton  # noqa
 import vllm_ascend.patch.worker.patch_kimi_k25  # noqa
 import vllm_ascend.patch.worker.patch_draft_quarot  # noqa
 import vllm_ascend.patch.worker.patch_cudagraph  # noqa
 import vllm_ascend.patch.worker.patch_deepseek_mtp  # noqa
-import vllm_ascend.patch.worker.patch_v2.patch_input_batch  # noqa
-import vllm_ascend.patch.worker.patch_v2.patch_model_state  # noqa
-import vllm_ascend.patch.worker.patch_v2.patch_block_table  # noqa
 import vllm_ascend.patch.worker.patch_gqa_c8  # noqa
 
-if not vllm_version_is("0.20.2"):
+if _V2_MODEL_RUNNER_SUPPORTED:
+    import vllm_ascend.patch.worker.patch_v2.patch_uva  # noqa
+    import vllm_ascend.patch.worker.patch_v2.patch_input_batch  # noqa
+    import vllm_ascend.patch.worker.patch_v2.patch_model_state  # noqa
+    import vllm_ascend.patch.worker.patch_v2.patch_block_table  # noqa
     import vllm_ascend.patch.worker.patch_v2.patch_attn_utils  # noqa
diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py
index d2fd0f2e0d0..fbc4c68794a 100644
--- a/vllm_ascend/worker/v2/attn_utils.py
+++ b/vllm_ascend/worker/v2/attn_utils.py
@@ -34,16 +34,14 @@
     MLAAttentionSpec,
     UniformTypeKVCacheSpecs,
 )
+from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata
 from vllm.v1.worker.utils import AttentionGroup
 
 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, AscendPrefillContextParallelMetadata
 from vllm_ascend.quantization.utils import enable_fa_quant
-from vllm_ascend.utils import calc_split_factor, vllm_version_is
-
-if not vllm_version_is("0.20.2"):
-    from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata
+from vllm_ascend.utils import calc_split_factor
 
 _ATTENTION_MASK_BUILDER = None
 
diff --git a/vllm_ascend/worker/v2/sample/logprob.py b/vllm_ascend/worker/v2/sample/logprob.py
index 9d275fe5776..d60f3d0d08f 100644
--- a/vllm_ascend/worker/v2/sample/logprob.py
+++ b/vllm_ascend/worker/v2/sample/logprob.py
@@ -20,12 +20,9 @@
 import torch
 from vllm.triton_utils import tl, triton
 from vllm.v1.outputs import LogprobsTensors
+from vllm.v1.worker.gpu.sample.logprob import LogprobTokenIdsState
 
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
-from vllm_ascend.utils import vllm_version_is
-
-if not vllm_version_is("0.20.2"):
-    from vllm.v1.worker.gpu.sample.logprob import LogprobTokenIdsState
 
 
 @triton.jit
@@ -124,7 +121,7 @@ def compute_topk_logprobs(
     num_logprobs: int,
     sampled_token_ids: torch.Tensor,
     cu_num_logits: list[int] | None = None,
-    logprob_token_ids_state: "LogprobTokenIdsState | None" = None,
+    logprob_token_ids_state: LogprobTokenIdsState | None = None,
     expanded_idx_mapping: torch.Tensor | None = None,
     max_per_req_token_ids: int = 0,
 ) -> LogprobsTensors:
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 722ec077021..f412f51970e 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -60,6 +60,7 @@
     enable_sp,
     get_ascend_device_type,
     register_ascend_customop,
+    vllm_version_is,
 )
 from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
 
@@ -136,6 +137,9 @@ def __init__(
             WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
 
         self.use_v2_model_runner = envs_vllm.VLLM_USE_V2_MODEL_RUNNER
+        if self.use_v2_model_runner and vllm_version_is("0.20.2"):
+            logger.warning("VLLM_USE_V2_MODEL_RUNNER is not supported on vllm 0.20.2; falling back to v1 model runner.")
+            self.use_v2_model_runner = False
         self._pp_send_work: list[Handle] = []
 
         ascend_compilation_config = get_ascend_config().ascend_compilation_config