From e6a001e5d9178b20d347ae0c3402bb3b1a61503d Mon Sep 17 00:00:00 2001 From: wangli Date: Mon, 11 May 2026 15:04:47 +0800 Subject: [PATCH 01/19] upgrade vllm to 0511 Signed-off-by: wangli --- .github/workflows/dockerfiles/Dockerfile.lint | 6 ++++-- .github/workflows/pr_test_full.yaml | 2 +- .github/workflows/pr_test_light.yaml | 6 +++--- .github/workflows/schedule_lint_image_build.yaml | 8 -------- docs/source/conf.py | 2 +- 5 files changed, 9 insertions(+), 15 deletions(-) diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index 1cbc47ba549..bacf219e1d6 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,8 +27,10 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_TAG=v0.20.1 -RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +ARG VLLM_COMMIT=05d610e5cdb710bffdccdda630b4eb0f79afd76d +RUN git init /vllm-workspace/vllm && \ + git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ + git -C /vllm-workspace/vllm checkout FETCH_HEAD # # Install vLLM common dependencies RUN python3 -m pip install -r /vllm-workspace/vllm/requirements/common.txt --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index b308103072f..0b5a5313418 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -80,7 +80,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1] + vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 6436962fef1..b35212d5559 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: c7aa186d67b6f051680831418e957c67f34ba7a2 + vllm: 05d610e5cdb710bffdccdda630b4eb0f79afd76d changes: runs-on: linux-aarch64-a2b3-0 container: @@ -155,7 +155,7 @@ jobs: if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }} strategy: matrix: - vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1] + vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1] uses: ./.github/workflows/_optional_smart_e2e.yaml with: vllm: ${{ matrix.vllm_version }} @@ -165,7 +165,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1] + vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_lint_image_build.yaml b/.github/workflows/schedule_lint_image_build.yaml index bf114e46d2d..36e012ff31f 100644 --- a/.github/workflows/schedule_lint_image_build.yaml +++ b/.github/workflows/schedule_lint_image_build.yaml @@ -4,12 +4,6 @@ on: # Runs at 00:00 UTC+8 every day - cron: '0 20 * * *' workflow_dispatch: - inputs: - vllm_hash: - description: 'vLLM base hash' - default: main - required: true - type: string push: paths: - '.github/workflows/dockerfiles/Dockerfile.lint' @@ -85,5 +79,3 @@ jobs: labels: ${{ steps.meta.outputs.labels }} tags: ${{ steps.meta.outputs.tags }} provenance: false - build-args: | - VLLM_HASH=${{ inputs.vllm_hash }} diff --git a/docs/source/conf.py b/docs/source/conf.py index 6e31082413c..51a6497cb71 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -81,7 +81,7 @@ # CANN image tag "cann_image_tag": "9.0.0-910b-ubuntu22.04-py3.11", # vLLM commit hash for main branch - "main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2", + "main_vllm_commit": "05d610e5cdb710bffdccdda630b4eb0f79afd76d", # vLLM tag for main branch "main_vllm_tag": "v0.20.1", # Python version for main branch From aea68398496225eece825d2607f72d003aa32f70 Mon Sep 17 00:00:00 2001 From: wangli Date: Mon, 11 May 2026 15:11:05 +0800 Subject: [PATCH 02/19] continue on error Signed-off-by: wangli --- .github/workflows/_e2e_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index eac9ea73826..39bf9f1966f 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -27,7 +27,7 @@ on: continue_on_error: required: false type: boolean - default: false + default: true # The following inputs are used by comment-triggered E2E tests (/e2e ). # They carry space-separated pytest paths, categorized by runner type. # Leave empty (default) when running label-triggered full/light suites. From 84552c3144d725b04d0e44e28d14cb9342d9896a Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 12 May 2026 10:02:20 +0800 Subject: [PATCH 03/19] fix Signed-off-by: wangli --- tests/ut/spec_decode/test_eagle_proposer.py | 10 ++++++++-- vllm_ascend/ops/gdn.py | 1 + vllm_ascend/patch/platform/patch_balance_schedule.py | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/ut/spec_decode/test_eagle_proposer.py b/tests/ut/spec_decode/test_eagle_proposer.py index fd82351b726..3fc67227bd6 100644 --- a/tests/ut/spec_decode/test_eagle_proposer.py +++ b/tests/ut/spec_decode/test_eagle_proposer.py @@ -1385,10 +1385,13 @@ def check_mock(self): "method", "parallel_drafting", "draft_tensor_parallel_size", - "speculative_token_tree", "draft_model_config", "disable_padded_drafter_batch", } + # speculative_token_tree was removed in newer vllm (Remove tree attention #42121); + # only check for it when the installed version still carries the field. + if "speculative_token_tree" in vllm.config.SpeculativeConfig.__dataclass_fields__: + fields.add("speculative_token_tree") actual = set(vllm.config.SpeculativeConfig.__dataclass_fields__) missing = fields - actual @@ -2260,10 +2263,13 @@ def check_mock(self): "enforce_eager", "use_local_argmax_reduction", "draft_tensor_parallel_size", - "speculative_token_tree", "draft_model_config", "disable_padded_drafter_batch", } + # speculative_token_tree was removed in newer vllm (Remove tree attention #42121); + # only check for it when the installed version still carries the field. + if "speculative_token_tree" in vllm.config.SpeculativeConfig.__dataclass_fields__: + fields.add("speculative_token_tree") actual = set(vllm.config.SpeculativeConfig.__dataclass_fields__) missing = fields - actual assert not missing, f"Missing dataclass fields: {missing}" diff --git a/vllm_ascend/ops/gdn.py b/vllm_ascend/ops/gdn.py index ef061f2316e..5254d7bc27a 100644 --- a/vllm_ascend/ops/gdn.py +++ b/vllm_ascend/ops/gdn.py @@ -129,6 +129,7 @@ def forward( b, a, core_attn_out, + False, self.prefix, ) diff --git a/vllm_ascend/patch/platform/patch_balance_schedule.py b/vllm_ascend/patch/platform/patch_balance_schedule.py index 5711352aff3..b509fa14af4 100644 --- a/vllm_ascend/patch/platform/patch_balance_schedule.py +++ b/vllm_ascend/patch/platform/patch_balance_schedule.py @@ -32,6 +32,7 @@ def __init__( kv_cache_config: KVCacheConfig, structured_output_manager: StructuredOutputManager, block_size: int, + hash_block_size: int | None = None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, include_finished_set: bool = False, log_stats: bool = False, @@ -41,6 +42,7 @@ def __init__( kv_cache_config, structured_output_manager, block_size, + hash_block_size, mm_registry, include_finished_set, log_stats, From a3296637a069d6e64eaf117010c8a218e259df4d Mon Sep 17 00:00:00 2001 From: wangli Date: Wed, 13 May 2026 11:19:03 +0800 Subject: [PATCH 04/19] fix Signed-off-by: wangli --- .github/workflows/dockerfiles/Dockerfile.lint | 2 +- .github/workflows/pr_test_full.yaml | 4 ++-- .github/workflows/pr_test_light.yaml | 6 +++--- .github/workflows/schedule_update_estimated_time.yaml | 2 +- .github/workflows/schedule_vllm_e2e_test.yaml | 2 +- Dockerfile | 2 +- Dockerfile.310p | 2 +- Dockerfile.310p.openEuler | 2 +- Dockerfile.a3 | 2 +- Dockerfile.a3.openEuler | 2 +- Dockerfile.openEuler | 2 +- docs/source/conf.py | 4 ++-- tests/e2e/multicard/2-cards/test_qwen3_moe.py | 2 +- tests/e2e/singlecard/model_runner_v2/test_basic.py | 4 ++-- vllm_ascend/core/scheduler_profiling_chunk.py | 4 ++-- vllm_ascend/patch/platform/patch_mla_prefill_backend.py | 2 +- vllm_ascend/patch/worker/patch_v2/patch_triton.py | 2 +- vllm_ascend/worker/v2/sample/logprob.py | 2 +- 18 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index bacf219e1d6..a362a0b269a 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=05d610e5cdb710bffdccdda630b4eb0f79afd76d +ARG VLLM_COMMIT=4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 0b5a5313418..b02387cdf6f 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -80,7 +80,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1] + vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml @@ -102,7 +102,7 @@ jobs: strategy: fail-fast: false matrix: - vllm_version: [v0.20.1] + vllm_version: [v0.20.2] needs: [parse-trigger] if: ${{ needs.parse-trigger.outputs.allowed == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index b35212d5559..50fe8e79f29 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 05d610e5cdb710bffdccdda630b4eb0f79afd76d + vllm: 4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef changes: runs-on: linux-aarch64-a2b3-0 container: @@ -155,7 +155,7 @@ jobs: if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }} strategy: matrix: - vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1] + vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2] uses: ./.github/workflows/_optional_smart_e2e.yaml with: vllm: ${{ matrix.vllm_version }} @@ -165,7 +165,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [05d610e5cdb710bffdccdda630b4eb0f79afd76d, v0.20.1] + vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_update_estimated_time.yaml b/.github/workflows/schedule_update_estimated_time.yaml index 904472e03bb..f72befae4b8 100644 --- a/.github/workflows/schedule_update_estimated_time.yaml +++ b/.github/workflows/schedule_update_estimated_time.yaml @@ -23,7 +23,7 @@ jobs: name: e2e-test strategy: matrix: - vllm_version: [v0.20.1] + vllm_version: [v0.20.2] type: [full, light] uses: ./.github/workflows/_e2e_test.yaml with: diff --git a/.github/workflows/schedule_vllm_e2e_test.yaml b/.github/workflows/schedule_vllm_e2e_test.yaml index c5fad253cce..805553bb2c6 100644 --- a/.github/workflows/schedule_vllm_e2e_test.yaml +++ b/.github/workflows/schedule_vllm_e2e_test.yaml @@ -47,7 +47,7 @@ jobs: fail-fast: false matrix: part: [0, 1, 2, 3] - vllm: [v0.20.1] + vllm: [v0.20.2] container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:9.0.0-910b-ubuntu22.04-py3.11 env: diff --git a/Dockerfile b/Dockerfile index 2c765c67e2c..2a8899df590 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.20.1 +ARG VLLM_TAG=v0.20.2 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index 20383636aa2..8252a25116e 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -33,7 +33,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.20.1 +ARG VLLM_TAG=v0.20.2 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index 77bc9b97103..72ef45c6f0e 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -32,7 +32,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.20.1 +ARG VLLM_TAG=v0.20.2 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3 b/Dockerfile.a3 index 5ad57fe4b61..b740836546e 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.20.1 +ARG VLLM_TAG=v0.20.2 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 13f3e40f413..7167450d419 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -49,7 +49,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.20.1 +ARG VLLM_TAG=v0.20.2 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 026f88dd4e8..da6f1c98b7e 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -49,7 +49,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.20.1 +ARG VLLM_TAG=v0.20.2 RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/docs/source/conf.py b/docs/source/conf.py index 51a6497cb71..16e84286b03 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -81,9 +81,9 @@ # CANN image tag "cann_image_tag": "9.0.0-910b-ubuntu22.04-py3.11", # vLLM commit hash for main branch - "main_vllm_commit": "05d610e5cdb710bffdccdda630b4eb0f79afd76d", + "main_vllm_commit": "4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef", # vLLM tag for main branch - "main_vllm_tag": "v0.20.1", + "main_vllm_tag": "v0.20.2", # Python version for main branch "main_python_version": ">= 3.10, < 3.12", # CANN version for main branch diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe.py b/tests/e2e/multicard/2-cards/test_qwen3_moe.py index 1a87a9b3a1b..7a9f7cef927 100644 --- a/tests/e2e/multicard/2-cards/test_qwen3_moe.py +++ b/tests/e2e/multicard/2-cards/test_qwen3_moe.py @@ -75,7 +75,7 @@ def test_qwen3_moe_distributed_aiv_tp2(): vllm_model.generate_greedy(example_prompts, max_tokens) -@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1") +@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2") @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [True]) @patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"}) diff --git a/tests/e2e/singlecard/model_runner_v2/test_basic.py b/tests/e2e/singlecard/model_runner_v2/test_basic.py index baad54d8fab..c6b4f245a04 100644 --- a/tests/e2e/singlecard/model_runner_v2/test_basic.py +++ b/tests/e2e/singlecard/model_runner_v2/test_basic.py @@ -65,7 +65,7 @@ def test_qwen3_dense_eager_mode( runner.model.generate(prompts, sampling_params) -@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1") +@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2") @pytest.mark.parametrize("model", MAIN_MODELS) @pytest.mark.parametrize("eagle_model", EGALE_MODELS) @pytest.mark.parametrize("max_tokens", [32]) @@ -104,7 +104,7 @@ def test_egale_spec_decoding( runner.model.generate(prompts, sampling_params) -@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1") +@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("enforce_eager", [False]) diff --git a/vllm_ascend/core/scheduler_profiling_chunk.py b/vllm_ascend/core/scheduler_profiling_chunk.py index b79aae83ff0..3545fbf9224 100644 --- a/vllm_ascend/core/scheduler_profiling_chunk.py +++ b/vllm_ascend/core/scheduler_profiling_chunk.py @@ -577,7 +577,7 @@ def schedule(self) -> SchedulerOutput: # noqa: C901 num_encoder_tokens = sum(request.get_num_encoder_embeds(i) for i in encoder_inputs_to_schedule) if ( - vllm_version_is("0.20.1") + vllm_version_is("0.20.2") and self.scheduler_reserve_full_isl and not self.kv_cache_manager.can_fit_full_sequence( request, @@ -601,7 +601,7 @@ def schedule(self) -> SchedulerOutput: # noqa: C901 delay_cache_blocks=load_kv_async, num_encoder_tokens=num_encoder_tokens, **( - {} if vllm_version_is("0.20.1") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl} + {} if vllm_version_is("0.20.2") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl} ), ) diff --git a/vllm_ascend/patch/platform/patch_mla_prefill_backend.py b/vllm_ascend/patch/platform/patch_mla_prefill_backend.py index c904575cb3f..75615ed9a43 100644 --- a/vllm_ascend/patch/platform/patch_mla_prefill_backend.py +++ b/vllm_ascend/patch/platform/patch_mla_prefill_backend.py @@ -17,7 +17,7 @@ from vllm_ascend.utils import vllm_version_is -if not vllm_version_is("0.20.1"): +if not vllm_version_is("0.20.2"): from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend class AscendMLAPrefillBackend(MLAPrefillBackend): diff --git a/vllm_ascend/patch/worker/patch_v2/patch_triton.py b/vllm_ascend/patch/worker/patch_v2/patch_triton.py index a910846d3a0..420b5b56298 100644 --- a/vllm_ascend/patch/worker/patch_v2/patch_triton.py +++ b/vllm_ascend/patch/worker/patch_v2/patch_triton.py @@ -29,7 +29,7 @@ logprob.compute_token_logprobs = compute_token_logprobs structured_outputs._apply_grammar_bitmask_kernel = _apply_grammar_bitmask_kernel -if not vllm_version_is("0.20.1"): +if not vllm_version_is("0.20.2"): from vllm_ascend.worker.v2.spec_decode.probabilistic_rejection_sampler_utils import ( probabilistic_rejection_sample as npu_probabilistic_rejection_sample, ) diff --git a/vllm_ascend/worker/v2/sample/logprob.py b/vllm_ascend/worker/v2/sample/logprob.py index e2fabd5fad1..9d275fe5776 100644 --- a/vllm_ascend/worker/v2/sample/logprob.py +++ b/vllm_ascend/worker/v2/sample/logprob.py @@ -24,7 +24,7 @@ from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num from vllm_ascend.utils import vllm_version_is -if not vllm_version_is("0.20.1"): +if not vllm_version_is("0.20.2"): from vllm.v1.worker.gpu.sample.logprob import LogprobTokenIdsState From b717ac5e2215e3e22570271c6a59f168d6fd69b6 Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 12 May 2026 19:21:37 +0800 Subject: [PATCH 05/19] fix weight load error Signed-off-by: wangli --- vllm_ascend/patch/worker/patch_deepseek_mtp.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/patch/worker/patch_deepseek_mtp.py b/vllm_ascend/patch/worker/patch_deepseek_mtp.py index bc147d74858..ec763569ec0 100644 --- a/vllm_ascend/patch/worker/patch_deepseek_mtp.py +++ b/vllm_ascend/patch/worker/patch_deepseek_mtp.py @@ -12,7 +12,11 @@ def get_spec_layer_idx_from_weight_name(config: DeepseekV2Config | DeepseekV3Con if hasattr(config, "num_nextn_predict_layers") and config.num_nextn_predict_layers > 0: layer_idx = config.num_hidden_layers for i in range(config.num_nextn_predict_layers): - if weight_name.startswith(f"model.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME): + if ( + weight_name.startswith(f"models.layers.{layer_idx + i}.") + or weight_name.startswith(MTP_ROT_WEIGHT_NAME) + or weight_name.startswith(f"layers.{layer_idx + i}.") + ): return layer_idx + i return None From 018d75b9456299635b4cde4a5abede768ced020f Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 12 May 2026 19:38:55 +0800 Subject: [PATCH 06/19] fix Signed-off-by: wangli --- .../patch/worker/patch_deepseek_mtp.py | 24 +++++++++++++++++-- vllm_ascend/worker/v2/block_table.py | 4 ++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/patch/worker/patch_deepseek_mtp.py b/vllm_ascend/patch/worker/patch_deepseek_mtp.py index ec763569ec0..bce4e826efd 100644 --- a/vllm_ascend/patch/worker/patch_deepseek_mtp.py +++ b/vllm_ascend/patch/worker/patch_deepseek_mtp.py @@ -5,6 +5,8 @@ from vllm.config import VllmConfig from vllm.model_executor.models.deepseek_mtp import DeepSeekMTP, DeepSeekMultiTokenPredictorLayer +from vllm_ascend.utils import vllm_version_is + MTP_ROT_WEIGHT_NAME = "rot.weight" @@ -21,6 +23,17 @@ def get_spec_layer_idx_from_weight_name(config: DeepseekV2Config | DeepseekV3Con return None +def get_spec_layer_idx_from_weight_name_020( + config: DeepseekV2Config | DeepseekV3Config, weight_name: str +) -> int | None: + if hasattr(config, "num_nextn_predict_layers") and config.num_nextn_predict_layers > 0: + layer_idx = config.num_hidden_layers + for i in range(config.num_nextn_predict_layers): + if weight_name.startswith(f"models.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME): + return layer_idx + i + return None + + class AscendDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer): def __init__(self, vllm_config: VllmConfig, prefix: str) -> None: super().__init__(vllm_config, prefix) @@ -61,7 +74,14 @@ def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str: return f"model.layers.{spec_layer}.rot.weight" -vllm.model_executor.models.deepseek_v2.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name -vllm.model_executor.models.deepseek_mtp.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name +if vllm_version_is("0.20.2"): + vllm.model_executor.models.deepseek_v2.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name_020 + vllm.model_executor.models.deepseek_mtp.get_spec_layer_idx_from_weight_name = ( + get_spec_layer_idx_from_weight_name_020 + ) +else: + vllm.model_executor.models.deepseek_v2.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name + vllm.model_executor.models.deepseek_mtp.get_spec_layer_idx_from_weight_name = get_spec_layer_idx_from_weight_name + vllm.model_executor.models.deepseek_mtp.DeepSeekMultiTokenPredictorLayer = AscendDeepSeekMultiTokenPredictorLayer vllm.model_executor.models.deepseek_mtp.DeepSeekMTP = AscendDeepSeekMTP diff --git a/vllm_ascend/worker/v2/block_table.py b/vllm_ascend/worker/v2/block_table.py index 8ce0d294b1a..94c7dd73dca 100644 --- a/vllm_ascend/worker/v2/block_table.py +++ b/vllm_ascend/worker/v2/block_table.py @@ -31,7 +31,7 @@ def __init__( block_sizes: list[int], max_num_reqs: int, max_num_batched_tokens: int, - max_model_len: int, + max_num_blocks_per_group: list[int], device: torch.device, cp_size: int = 1, cp_rank: int = 0, @@ -41,7 +41,7 @@ def __init__( block_sizes, max_num_reqs, max_num_batched_tokens, - max_model_len, + max_num_blocks_per_group, device, cp_size, cp_rank, From f68a9fb2ef4a0b3a66739a3c077f27ff97f873a9 Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 12 May 2026 20:03:58 +0800 Subject: [PATCH 07/19] adapt mdrnV2 Signed-off-by: wangli --- vllm_ascend/worker/v2/attn_utils.py | 29 ++++++++++++++++--- vllm_ascend/worker/v2/model_runner.py | 14 ++++++--- vllm_ascend/worker/v2/model_states/default.py | 1 + 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py index 71c3efccae4..fbc4c68794a 100644 --- a/vllm_ascend/worker/v2/attn_utils.py +++ b/vllm_ascend/worker/v2/attn_utils.py @@ -34,6 +34,7 @@ MLAAttentionSpec, UniformTypeKVCacheSpecs, ) +from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata from vllm.v1.worker.utils import AttentionGroup from vllm_ascend.attention.attention_mask import AttentionMaskBuilder @@ -75,6 +76,8 @@ def build_attn_metadata( graph_pad_size: int = -1, num_input_tokens: int = 0, prefill_context_parallel_metadata: AscendPrefillContextParallelMetadata | None = None, + model_specific_attn_metadata: ModelSpecificAttnMetadata | None = None, + for_cudagraph_capture: bool = False, ) -> dict[str, Any]: """Build attention metadata for Ascend NPUs.""" # TODO(Ronald1995): optimize AscendCommonAttentionMetadata. @@ -92,6 +95,11 @@ def build_attn_metadata( block_table = block_tables[i] slot_mapping = slot_mappings[i] + common_attn_metadata_extra_kwargs = ( + model_specific_attn_metadata.get_extra_common_attn_kwargs(i, num_reqs) + if model_specific_attn_metadata is not None + else {} + ) common_attn_metadata = AscendCommonAttentionMetadata( query_start_loc=query_start_loc_gpu, query_start_loc_cpu=query_start_loc_cpu, @@ -109,14 +117,27 @@ def build_attn_metadata( num_input_tokens=num_input_tokens, prefill_context_parallel_metadata=prefill_context_parallel_metadata, max_seq_len=max_seq_len, + **common_attn_metadata_extra_kwargs, ) for attn_group in attn_groups[i]: attn_metadata_builder = attn_group.get_metadata_builder(0) - metadata = attn_metadata_builder.build( - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - ) + if for_cudagraph_capture: + metadata = attn_metadata_builder.build_for_cudagraph_capture(common_attn_metadata) + else: + attn_metadata_extra_kwargs = ( + model_specific_attn_metadata.get_extra_attn_kwargs( + attn_metadata_builder, + num_reqs, + ) + if model_specific_attn_metadata is not None + else {} + ) + metadata = attn_metadata_builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + **attn_metadata_extra_kwargs, + ) for layer_name in attn_group.layer_names: attn_metadata[layer_name] = metadata return attn_metadata diff --git a/vllm_ascend/worker/v2/model_runner.py b/vllm_ascend/worker/v2/model_runner.py index a2903dca9c4..f09be6a5e63 100644 --- a/vllm_ascend/worker/v2/model_runner.py +++ b/vllm_ascend/worker/v2/model_runner.py @@ -208,6 +208,7 @@ def prepare_inputs( # Get the number of draft tokens for each request. draft_tokens = scheduler_output.scheduled_spec_decode_tokens + num_draft_tokens_per_req: np.ndarray | None = None if not draft_tokens: # No draft token scheduled (common case). total_num_draft_tokens = 0 @@ -217,14 +218,14 @@ def prepare_inputs( expanded_idx_mapping = idx_mapping expanded_local_pos = torch.zeros(num_reqs, dtype=torch.int32, device=self.device) else: - num_draft_tokens = np.array( + num_draft_tokens_per_req = np.array( [len(draft_tokens.get(req_id, ())) for req_id in req_ids], dtype=np.int32, ) - total_num_draft_tokens = int(num_draft_tokens.sum()) + total_num_draft_tokens = int(num_draft_tokens_per_req.sum()) total_num_logits = num_reqs + total_num_draft_tokens - num_logits = num_draft_tokens + 1 + num_logits = num_draft_tokens_per_req + 1 cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32) cu_num_logits_np[0] = 0 np.cumsum(num_logits, out=cu_num_logits_np[1:]) @@ -261,9 +262,12 @@ def prepare_inputs( query_start_loc_np = query_start_loc_np[: num_reqs_padded + 1] query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1] + is_prefilling_np = ( + self.req_states.num_computed_prefill_tokens[idx_mapping_np] < self.req_states.prefill_len.np[idx_mapping_np] + ) # Get prefill tokens if any. - if self.req_states.any_prefills(idx_mapping_np): + if np.any(is_prefilling_np): prepare_prefill_inputs( self.input_buffers.input_ids, self.req_states.next_prefill_tokens, @@ -326,11 +330,13 @@ def prepare_inputs( num_tokens=num_tokens, num_tokens_after_padding=num_tokens_after_padding, num_draft_tokens=total_num_draft_tokens, + num_draft_tokens_per_req=num_draft_tokens_per_req, query_start_loc=query_start_loc, query_start_loc_np=query_start_loc_np, seq_lens=seq_lens, seq_lens_cpu_upper_bound=seq_lens_cpu_upper_bound, dcp_local_seq_lens=None, # TODO(Ronald1995): support cp. + is_prefilling_np=is_prefilling_np, input_ids=input_ids, positions=positions, logits_indices=logits_indices, diff --git a/vllm_ascend/worker/v2/model_states/default.py b/vllm_ascend/worker/v2/model_states/default.py index 90546de5cf1..9dd9d888ead 100644 --- a/vllm_ascend/worker/v2/model_states/default.py +++ b/vllm_ascend/worker/v2/model_states/default.py @@ -72,5 +72,6 @@ def prepare_attn( seq_lens_np=input_batch.seq_lens_np, positions=input_batch.positions, attn_state=input_batch.attn_state, + for_cudagraph_capture=for_capture, ) return self.attn_metadata From 72d571e9b1fd1998838ffb321c7adaa9a05cb8b3 Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 12 May 2026 23:40:24 +0800 Subject: [PATCH 08/19] fix patch Signed-off-by: wangli --- vllm_ascend/patch/worker/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index 38dd6fcde8e..d5827935510 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -17,14 +17,13 @@ from vllm.triton_utils import HAS_TRITON -from vllm_ascend.utils import is_310p +from vllm_ascend.utils import is_310p, vllm_version_is if HAS_TRITON: import vllm_ascend.patch.worker.patch_triton import vllm_ascend.patch.worker.patch_v2.patch_triton # noqa -# isort: off import vllm_ascend.patch.worker.patch_weight_utils # noqa import vllm_ascend.patch.platform.patch_sched_yield # noqa import vllm_ascend.patch.worker.patch_bert # noqa @@ -53,4 +52,6 @@ import vllm_ascend.patch.worker.patch_v2.patch_model_state # noqa import vllm_ascend.patch.worker.patch_v2.patch_block_table # noqa import vllm_ascend.patch.worker.patch_gqa_c8 # noqa -import vllm_ascend.patch.worker.patch_v2.patch_attn_utils # noqa + +if not vllm_version_is("0.20.2"): + import vllm_ascend.patch.worker.patch_v2.patch_attn_utils # noqa From d3a30f6ea5571d06a92fd32bffed94ade16009fe Mon Sep 17 00:00:00 2001 From: wangli Date: Tue, 12 May 2026 23:39:01 +0800 Subject: [PATCH 09/19] fix Signed-off-by: wangli --- vllm_ascend/worker/model_runner_v1.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 316f0a06047..ac9dfd0278d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2273,8 +2273,11 @@ def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int: return round_up(num_scheduled_tokens, tp_size) return num_scheduled_tokens - # This is a function from the upstream vllm used to handle PP+SP. Since the judgment logic - # of flashcomm1 in Ascend is inconsistent with SP in vllm, it needs to be overridden. + # These functions from upstream vllm handle PP+SP. Ascend's flashcomm1 SP + # differs from vllm's native SP: flashcomm1 does NOT scatter the residual + # before PP send, so the all_gather in sync_and_gather_intermediate_tensors + # must be skipped. Both overrides use enable_sp() rather than + # is_residual_scattered_for_sp() to reflect the actual Ascend SP state. def sync_and_slice_intermediate_tensors( self, num_tokens: int, @@ -2284,8 +2287,6 @@ def sync_and_slice_intermediate_tensors( assert self.intermediate_tensors is not None tp = self.vllm_config.parallel_config.tensor_parallel_size - # When sequence parallelism is enabled, the "residual" tensor is sharded - # across tensor parallel ranks, so each rank only needs its own slice. if sync_self: assert intermediate_tensors is not None for k, v in intermediate_tensors.items(): @@ -2303,6 +2304,19 @@ def sync_and_slice_intermediate_tensors( } ) + def sync_and_gather_intermediate_tensors( + self, + num_tokens: int, + intermediate_tensors: IntermediateTensors | None, + sync_self: bool, + ) -> IntermediateTensors: + # vllm renamed sync_and_slice to sync_and_gather in v0.20.2. + # The Ascend override logic is identical: skip the upstream all_gather + # (flashcomm1 does not scatter residual before PP send). + return self.sync_and_slice_intermediate_tensors( + num_tokens, intermediate_tensors, sync_self + ) + def _determine_batch_execution_and_padding( self, num_tokens: int, From 86dac634b418121fda4727104fc6ede03751970a Mon Sep 17 00:00:00 2001 From: wangli Date: Wed, 13 May 2026 10:34:05 +0800 Subject: [PATCH 10/19] fix gdn Signed-off-by: wangli --- vllm_ascend/ops/gdn.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/ops/gdn.py b/vllm_ascend/ops/gdn.py index 5254d7bc27a..10dfc6495a0 100644 --- a/vllm_ascend/ops/gdn.py +++ b/vllm_ascend/ops/gdn.py @@ -32,6 +32,7 @@ from vllm_ascend.ops.triton.fla.utils import clear_ssm_states from vllm_ascend.ops.triton.fused_gdn_gating import fused_gdn_gating_patch from vllm_ascend.ops.triton.mamba.causal_conv1d import causal_conv1d_update_npu +from vllm_ascend.utils import vllm_version_is def to_int64_tuple(tensor: torch.Tensor) -> tuple[int, ...]: @@ -124,14 +125,23 @@ def forward( device=hidden_states.device, ) - torch.ops.vllm.gdn_attention_core( - mixed_qkv, - b, - a, - core_attn_out, - False, - self.prefix, - ) + if vllm_version_is("0.20.2"): + torch.ops.vllm.gdn_attention_core( + mixed_qkv, + b, + a, + core_attn_out, + self.prefix, + ) + else: + torch.ops.vllm.gdn_attention_core( + mixed_qkv, + b, + a, + core_attn_out, + False, + self.prefix, + ) # ============================================================ # Part 3: Output Projection From a4cee58053b3224d19ad3d217ded25632952d345 Mon Sep 17 00:00:00 2001 From: wangli Date: Wed, 13 May 2026 11:18:07 +0800 Subject: [PATCH 11/19] fix patch Signed-off-by: wangli --- vllm_ascend/patch/worker/patch_deepseek_mtp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/patch/worker/patch_deepseek_mtp.py b/vllm_ascend/patch/worker/patch_deepseek_mtp.py index bce4e826efd..696d3e55127 100644 --- a/vllm_ascend/patch/worker/patch_deepseek_mtp.py +++ b/vllm_ascend/patch/worker/patch_deepseek_mtp.py @@ -15,7 +15,7 @@ def get_spec_layer_idx_from_weight_name(config: DeepseekV2Config | DeepseekV3Con layer_idx = config.num_hidden_layers for i in range(config.num_nextn_predict_layers): if ( - weight_name.startswith(f"models.layers.{layer_idx + i}.") + weight_name.startswith(f"model.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME) or weight_name.startswith(f"layers.{layer_idx + i}.") ): @@ -29,7 +29,7 @@ def get_spec_layer_idx_from_weight_name_020( if hasattr(config, "num_nextn_predict_layers") and config.num_nextn_predict_layers > 0: layer_idx = config.num_hidden_layers for i in range(config.num_nextn_predict_layers): - if weight_name.startswith(f"models.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME): + if weight_name.startswith(f"model.layers.{layer_idx + i}.") or weight_name.startswith(MTP_ROT_WEIGHT_NAME): return layer_idx + i return None From b95534af97f9bb1bc25d27c23c949e881bca5dc0 Mon Sep 17 00:00:00 2001 From: wangli Date: Wed, 13 May 2026 11:32:00 +0800 Subject: [PATCH 12/19] fix lint Signed-off-by: wangli --- vllm_ascend/worker/v2/model_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/worker/v2/model_runner.py b/vllm_ascend/worker/v2/model_runner.py index f09be6a5e63..2236b76db8f 100644 --- a/vllm_ascend/worker/v2/model_runner.py +++ b/vllm_ascend/worker/v2/model_runner.py @@ -218,14 +218,15 @@ def prepare_inputs( expanded_idx_mapping = idx_mapping expanded_local_pos = torch.zeros(num_reqs, dtype=torch.int32, device=self.device) else: - num_draft_tokens_per_req = np.array( + num_draft_tokens_arr = np.array( [len(draft_tokens.get(req_id, ())) for req_id in req_ids], dtype=np.int32, ) - total_num_draft_tokens = int(num_draft_tokens_per_req.sum()) + num_draft_tokens_per_req = num_draft_tokens_arr + total_num_draft_tokens = int(num_draft_tokens_arr.sum()) total_num_logits = num_reqs + total_num_draft_tokens - num_logits = num_draft_tokens_per_req + 1 + num_logits = num_draft_tokens_arr + 1 cu_num_logits_np = np.empty(num_reqs + 1, dtype=np.int32) cu_num_logits_np[0] = 0 np.cumsum(num_logits, out=cu_num_logits_np[1:]) From a0b93d5d21d919abd702726133a38de863c9fb37 Mon Sep 17 00:00:00 2001 From: wangli Date: Wed, 13 May 2026 14:49:37 +0800 Subject: [PATCH 13/19] disable async schedule when enable_return_routed_experts Signed-off-by: wangli --- tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py b/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py index 97f90698bc0..b22d3504a10 100644 --- a/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py +++ b/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py @@ -19,6 +19,7 @@ def test_qwen3_moe_routing_replay(): cudagraph_capture_sizes=[1, 2, 4, 8], distributed_executor_backend="mp", enable_return_routed_experts=True, + async_scheduling=False, ) as vllm_model: sampling_params = SamplingParams( max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY From 0396b2f9c5f891182f0cde9fe764201535df75c5 Mon Sep 17 00:00:00 2001 From: wangli Date: Wed, 13 May 2026 15:30:06 +0800 Subject: [PATCH 14/19] async D2H pipeline for routed expert replay Signed-off-by: wangli --- vllm_ascend/ops/fused_moe/fused_moe.py | 14 +- .../ops/fused_moe/routed_experts_compat.py | 126 ++++++++++++++++++ vllm_ascend/worker/model_runner_v1.py | 29 ++-- 3 files changed, 151 insertions(+), 18 deletions(-) create mode 100644 vllm_ascend/ops/fused_moe/routed_experts_compat.py diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index 4f1c5eebcec..46730a98366 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -27,7 +27,6 @@ from vllm.logger import logger from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner # type: ignore import vllm_ascend.envs as envs_ascend @@ -36,6 +35,7 @@ from vllm_ascend.distributed.parallel_state import get_mc2_group from vllm_ascend.eplb.core.eplb_utils import init_eplb_config from vllm_ascend.flash_common3_context import get_flash_common3_context, set_flash_common3_context +from vllm_ascend.ops.fused_moe import routed_experts_compat from vllm_ascend.ops.fused_moe.experts_selector import select_experts, zero_experts_compute from vllm_ascend.ops.fused_moe.moe_comm_method import AllGatherCommImpl, FusedExpertsResult, setup_moe_comm_method from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input @@ -159,12 +159,12 @@ def apply( num_experts=num_logical_experts, ) if layer.vllm_config.model_config is not None and layer.vllm_config.model_config.enable_return_routed_experts: - capturer = RoutedExpertsCapturer.get_instance() - if capturer is not None: - capturer.capture( - layer_id=layer.layer_id, - topk_ids=topk_ids, - ) + capturer = routed_experts_compat.get_capturer() + routed_experts_compat.call_capture( + capturer, + layer_id=layer.layer_id, + topk_ids=topk_ids, + ) if zero_expert_num > 0 and zero_expert_type is not None: topk_ids, topk_weights, zero_expert_result = zero_experts_compute( diff --git a/vllm_ascend/ops/fused_moe/routed_experts_compat.py b/vllm_ascend/ops/fused_moe/routed_experts_compat.py new file mode 100644 index 00000000000..7f103134749 --- /dev/null +++ b/vllm_ascend/ops/fused_moe/routed_experts_compat.py @@ -0,0 +1,126 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +"""Compatibility shim around vLLM's RoutedExpertsCapturer. + +vLLM PR #39917 (post-0.20.2) reworked the routed-experts pipeline: +- 0.20.2 exposed `RoutedExpertsCapturer.get_instance()` plus + `clear_buffer()` / `save_captured_experts(indices=...)` methods. +- main moved to module-level helpers (`get_global_experts_capturer`, + `issue_routing_d2h_copy`, `extract_routed_experts_for_current_batch`, + `free_routing_buffers`, `init_routed_experts_capturer_with_shared_cache`). + +We support exactly two vLLM versions: 0.20.2 (old API) and main (new +API). Selection is via `vllm_version_is("0.20.2")` -- explicit, no +runtime symbol probing. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np +import torch +from vllm.model_executor.layers.fused_moe import routed_experts_capturer as _rec + +from vllm_ascend.utils import vllm_version_is + +if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput + +USE_LEGACY_API = vllm_version_is("0.20.2") + + +def get_capturer(): + """Return the global capturer instance, or None if not initialized.""" + if USE_LEGACY_API: + return _rec.RoutedExpertsCapturer.get_instance() + return _rec.get_global_experts_capturer() + + +def clear_step_buffers(scheduler_output: SchedulerOutput) -> None: + """Free per-request routing buffers for finished/preempted reqs. + + main: `free_routing_buffers(finished, preempted)`. + 0.20.2: `capturer.clear_buffer()` (full-buffer reset). + """ + if USE_LEGACY_API: + capturer = get_capturer() + if capturer is not None: + capturer.clear_buffer() + return + + _rec.free_routing_buffers( + scheduler_output.finished_req_ids, + getattr(scheduler_output, "preempted_req_ids", None), + ) + + +def issue_d2h_copy( + *, + input_batch_req_ids: list[str], + num_scheduled_tokens: dict[str, int], + positions: torch.Tensor, + positions_cpu: torch.Tensor | None, + legacy_indices: torch.Tensor | None = None, +) -> None: + """Trigger the per-step D2H copy of routed experts. + + main: `issue_routing_d2h_copy(...)` (async copy). + 0.20.2: `capturer.save_captured_experts(indices=legacy_indices)`. + """ + if USE_LEGACY_API: + capturer = get_capturer() + if capturer is not None: + capturer.save_captured_experts(indices=legacy_indices) + return + + _rec.issue_routing_d2h_copy( + input_batch_req_ids=input_batch_req_ids, + num_scheduled_tokens=num_scheduled_tokens, + positions=positions, + positions_cpu=positions_cpu, + ) + + +def extract_for_current_batch( + *, + req_ids: list[str], + requests: dict, + req_id_to_index: dict[str, int], + num_tokens_no_spec: np.ndarray, + max_model_len: int, +) -> dict[str, np.ndarray] | None: + """Pull routing data for requests finishing this step. + + main: `extract_routed_experts_for_current_batch(...)`. + 0.20.2: routing data flows through a different channel inside + `save_captured_experts`, so this returns None. + """ + if USE_LEGACY_API: + return None + return _rec.extract_routed_experts_for_current_batch( + req_ids=req_ids, + requests=requests, + req_id_to_index=req_id_to_index, + num_tokens_no_spec=num_tokens_no_spec, + max_model_len=max_model_len, + ) + + +def call_capture(capturer, *, layer_id: int, topk_ids: torch.Tensor) -> None: + """Invoke `.capture(...)` on a capturer instance. + + Both 0.20.2 and main expose `capture(layer_id, topk_ids)`, so this + is a thin pass-through kept for symmetry with the other helpers. + """ + if capturer is None: + return + capturer.capture(layer_id=layer_id, topk_ids=topk_ids) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index ac9dfd0278d..13d42c546da 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -150,7 +150,7 @@ set_mc2_mask, set_mc2_tokens_capacity, ) -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer +from vllm_ascend.ops.fused_moe import routed_experts_compat if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] @@ -1498,11 +1498,7 @@ def execute_model( intermediate_tensors: IntermediateTensors | None = None, ) -> ModelRunnerOutput | IntermediateTensors | None: if self.vllm_config.model_config.enable_return_routed_experts: - capturer = RoutedExpertsCapturer.get_instance() - if capturer is not None: - capturer.clear_buffer() - else: - logger.warning("RoutedExpertsCapturer is not initialized.") + routed_experts_compat.clear_step_buffers(scheduler_output) if self.ascend_config.profiling_chunk_config.need_timing: # Check if the scheduler signaled that calibration is complete. @@ -1968,12 +1964,22 @@ def propose_draft_token_ids(sampled_token_ids): if self.speculative_config is not None: self.finalize_kv_connector() + routed_experts_dict = None if self.model_config.enable_return_routed_experts: - capturer = RoutedExpertsCapturer.get_instance() - if capturer is not None: - capturer.save_captured_experts(indices=self.cpu_slot_mapping) - else: - logger.warning("RoutedExpertsCapturer is not initialized.") + routed_experts_compat.issue_d2h_copy( + input_batch_req_ids=self.input_batch.req_ids, + num_scheduled_tokens=scheduler_output.num_scheduled_tokens, + positions=self.positions, + positions_cpu=getattr(self, "_positions_cpu", None), + legacy_indices=self.cpu_slot_mapping, + ) + routed_experts_dict = routed_experts_compat.extract_for_current_batch( + req_ids=req_ids_output_copy, + requests=self.requests, + req_id_to_index=self.input_batch.req_id_to_index, + num_tokens_no_spec=self.input_batch.num_tokens_no_spec, + max_model_len=self.max_model_len, + ) model_runner_output = ModelRunnerOutput( req_ids=req_ids_output_copy, @@ -1985,6 +1991,7 @@ def propose_draft_token_ids(sampled_token_ids): pooler_output=[], ec_connector_output=ec_connector_output if self.supports_mm_inputs else None, cudagraph_stats=cudagraph_stats, + routed_experts_dict=routed_experts_dict, ) if self.ascend_config.profiling_chunk_config.need_timing and hasattr(self, '_execution_start_time'): self._sync_device() From 0ac534a80062348822f01ad83b60256aec7d54aa Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 14 May 2026 09:16:20 +0800 Subject: [PATCH 15/19] adapt for 0202 Signed-off-by: wangli --- vllm_ascend/worker/model_runner_v1.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 13d42c546da..4fe84c6f4a8 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -138,6 +138,7 @@ lmhead_tp_enable, set_weight_prefetch_method, should_skip_allreduce_across_dp_group, + vllm_version_is, ) from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.pcp_utils import PCPManager @@ -1991,7 +1992,9 @@ def propose_draft_token_ids(sampled_token_ids): pooler_output=[], ec_connector_output=ec_connector_output if self.supports_mm_inputs else None, cudagraph_stats=cudagraph_stats, - routed_experts_dict=routed_experts_dict, + **( + {} if vllm_version_is("0.20.2") else {"routed_experts_dict": routed_experts_dict} + ), ) if self.ascend_config.profiling_chunk_config.need_timing and hasattr(self, '_execution_start_time'): self._sync_device() From 541a61cc20658932b7465346bf4074865b6446d7 Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 14 May 2026 11:49:41 +0800 Subject: [PATCH 16/19] tiny fix Signed-off-by: wangli --- vllm_ascend/ops/fused_moe/routed_experts_compat.py | 6 ------ vllm_ascend/worker/v2/attn_utils.py | 6 ++++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/ops/fused_moe/routed_experts_compat.py b/vllm_ascend/ops/fused_moe/routed_experts_compat.py index 7f103134749..2a01033b3d7 100644 --- a/vllm_ascend/ops/fused_moe/routed_experts_compat.py +++ b/vllm_ascend/ops/fused_moe/routed_experts_compat.py @@ -9,17 +9,11 @@ # http://www.apache.org/licenses/LICENSE-2.0 # """Compatibility shim around vLLM's RoutedExpertsCapturer. - -vLLM PR #39917 (post-0.20.2) reworked the routed-experts pipeline: - 0.20.2 exposed `RoutedExpertsCapturer.get_instance()` plus `clear_buffer()` / `save_captured_experts(indices=...)` methods. - main moved to module-level helpers (`get_global_experts_capturer`, `issue_routing_d2h_copy`, `extract_routed_experts_for_current_batch`, `free_routing_buffers`, `init_routed_experts_capturer_with_shared_cache`). - -We support exactly two vLLM versions: 0.20.2 (old API) and main (new -API). Selection is via `vllm_version_is("0.20.2")` -- explicit, no -runtime symbol probing. """ from __future__ import annotations diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py index fbc4c68794a..ceda15a074c 100644 --- a/vllm_ascend/worker/v2/attn_utils.py +++ b/vllm_ascend/worker/v2/attn_utils.py @@ -34,14 +34,16 @@ MLAAttentionSpec, UniformTypeKVCacheSpecs, ) -from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata from vllm.v1.worker.utils import AttentionGroup from vllm_ascend.attention.attention_mask import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, AscendPrefillContextParallelMetadata from vllm_ascend.quantization.utils import enable_fa_quant -from vllm_ascend.utils import calc_split_factor +from vllm_ascend.utils import calc_split_factor, vllm_version_is + +if vllm_version_is("0.20.2"): + from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata _ATTENTION_MASK_BUILDER = None From 8ae2bfc8a3449a63b6b7f40a4b34d5a8959150b2 Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 14 May 2026 12:35:45 +0800 Subject: [PATCH 17/19] fix Signed-off-by: wangli --- vllm_ascend/worker/v2/attn_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py index ceda15a074c..d2fd0f2e0d0 100644 --- a/vllm_ascend/worker/v2/attn_utils.py +++ b/vllm_ascend/worker/v2/attn_utils.py @@ -42,7 +42,7 @@ from vllm_ascend.quantization.utils import enable_fa_quant from vllm_ascend.utils import calc_split_factor, vllm_version_is -if vllm_version_is("0.20.2"): +if not vllm_version_is("0.20.2"): from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata _ATTENTION_MASK_BUILDER = None From 9c4576e235f5e503d56ce4b5c7e917324bfa0205 Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 14 May 2026 12:36:38 +0800 Subject: [PATCH 18/19] fix Signed-off-by: wangli --- .github/workflows/_e2e_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 39bf9f1966f..eac9ea73826 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -27,7 +27,7 @@ on: continue_on_error: required: false type: boolean - default: true + default: false # The following inputs are used by comment-triggered E2E tests (/e2e ). # They carry space-separated pytest paths, categorized by runner type. # Leave empty (default) when running label-triggered full/light suites. From 6cdb1aded3b7898f84ae5054a8803efb606e6e5d Mon Sep 17 00:00:00 2001 From: wangli Date: Thu, 14 May 2026 15:14:15 +0800 Subject: [PATCH 19/19] remove mdrnv2 for v0.20.2 Signed-off-by: wangli --- vllm_ascend/patch/worker/__init__.py | 17 +++++++++++------ vllm_ascend/worker/v2/attn_utils.py | 6 ++---- vllm_ascend/worker/v2/sample/logprob.py | 7 ++----- vllm_ascend/worker/worker.py | 4 ++++ 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index d5827935510..6dab60c7701 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -19,9 +19,14 @@ from vllm_ascend.utils import is_310p, vllm_version_is +# v2 model runner is only supported on vllm > 0.20.2. +_V2_MODEL_RUNNER_SUPPORTED = not vllm_version_is("0.20.2") + if HAS_TRITON: import vllm_ascend.patch.worker.patch_triton - import vllm_ascend.patch.worker.patch_v2.patch_triton # noqa + + if _V2_MODEL_RUNNER_SUPPORTED: + import vllm_ascend.patch.worker.patch_v2.patch_triton # noqa import vllm_ascend.patch.worker.patch_weight_utils # noqa @@ -41,17 +46,17 @@ else: import vllm_ascend.patch.worker.patch_idex_310 # noqa import vllm_ascend.patch.worker.patch_rejection_sampler # noqa -import vllm_ascend.patch.worker.patch_v2.patch_uva # noqa import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa import vllm_ascend.patch.worker.patch_kimi_k25 # noqa import vllm_ascend.patch.worker.patch_draft_quarot # noqa import vllm_ascend.patch.worker.patch_cudagraph # noqa import vllm_ascend.patch.worker.patch_deepseek_mtp # noqa -import vllm_ascend.patch.worker.patch_v2.patch_input_batch # noqa -import vllm_ascend.patch.worker.patch_v2.patch_model_state # noqa -import vllm_ascend.patch.worker.patch_v2.patch_block_table # noqa import vllm_ascend.patch.worker.patch_gqa_c8 # noqa -if not vllm_version_is("0.20.2"): +if _V2_MODEL_RUNNER_SUPPORTED: + import vllm_ascend.patch.worker.patch_v2.patch_uva # noqa + import vllm_ascend.patch.worker.patch_v2.patch_input_batch # noqa + import vllm_ascend.patch.worker.patch_v2.patch_model_state # noqa + import vllm_ascend.patch.worker.patch_v2.patch_block_table # noqa import vllm_ascend.patch.worker.patch_v2.patch_attn_utils # noqa diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py index d2fd0f2e0d0..fbc4c68794a 100644 --- a/vllm_ascend/worker/v2/attn_utils.py +++ b/vllm_ascend/worker/v2/attn_utils.py @@ -34,16 +34,14 @@ MLAAttentionSpec, UniformTypeKVCacheSpecs, ) +from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata from vllm.v1.worker.utils import AttentionGroup from vllm_ascend.attention.attention_mask import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, AscendPrefillContextParallelMetadata from vllm_ascend.quantization.utils import enable_fa_quant -from vllm_ascend.utils import calc_split_factor, vllm_version_is - -if not vllm_version_is("0.20.2"): - from vllm.v1.worker.gpu.model_states.interface import ModelSpecificAttnMetadata +from vllm_ascend.utils import calc_split_factor _ATTENTION_MASK_BUILDER = None diff --git a/vllm_ascend/worker/v2/sample/logprob.py b/vllm_ascend/worker/v2/sample/logprob.py index 9d275fe5776..d60f3d0d08f 100644 --- a/vllm_ascend/worker/v2/sample/logprob.py +++ b/vllm_ascend/worker/v2/sample/logprob.py @@ -20,12 +20,9 @@ import torch from vllm.triton_utils import tl, triton from vllm.v1.outputs import LogprobsTensors +from vllm.v1.worker.gpu.sample.logprob import LogprobTokenIdsState from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num -from vllm_ascend.utils import vllm_version_is - -if not vllm_version_is("0.20.2"): - from vllm.v1.worker.gpu.sample.logprob import LogprobTokenIdsState @triton.jit @@ -124,7 +121,7 @@ def compute_topk_logprobs( num_logprobs: int, sampled_token_ids: torch.Tensor, cu_num_logits: list[int] | None = None, - logprob_token_ids_state: "LogprobTokenIdsState | None" = None, + logprob_token_ids_state: LogprobTokenIdsState | None = None, expanded_idx_mapping: torch.Tensor | None = None, max_per_req_token_ids: int = 0, ) -> LogprobsTensors: diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 722ec077021..f412f51970e 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -60,6 +60,7 @@ enable_sp, get_ascend_device_type, register_ascend_customop, + vllm_version_is, ) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner @@ -136,6 +137,9 @@ def __init__( WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod") self.use_v2_model_runner = envs_vllm.VLLM_USE_V2_MODEL_RUNNER + if self.use_v2_model_runner and vllm_version_is("0.20.2"): + logger.warning("VLLM_USE_V2_MODEL_RUNNER is not supported on vllm 0.20.2; falling back to v1 model runner.") + self.use_v2_model_runner = False self._pp_send_work: list[Handle] = [] ascend_compilation_config = get_ascend_config().ascend_compilation_config