From 2f270d37685190578cc4509776afcac805b24529 Mon Sep 17 00:00:00 2001 From: Meihan-chen Date: Mon, 2 Mar 2026 15:10:50 +0800 Subject: [PATCH 1/2] fix: adapt to upstream vLLM changes (15d76f7..6290470) Root causes: - CudagraphDispatcher.dispatch() disable_full replaced with valid_modes/invalid_modes (PR #34102) - compile_or_warm_up_model() now returns float compilation_time (PR #35503) - MMEncoderAttention forward methods added sequence_lengths param (PR #35564) - Removed auto-forcing of +rms_norm for sequence parallelism (PR #35410) Upstream commit range: 15d76f74e2fdb12a95ea00f0ca283acf6219a2b7..6290470843c131681e3e1318ae71070a34f33225 Co-Authored-By: Claude Code --- .github/workflows/bot_pr_create.yaml | 2 +- .github/workflows/pr_test_full.yaml | 2 +- .github/workflows/pr_test_light.yaml | 6 +++--- .github/workflows/schedule_codecov_refresh.yaml | 2 +- vllm_ascend/ops/mm_encoder_attention.py | 1 + vllm_ascend/platform.py | 6 ++++++ vllm_ascend/worker/model_runner_v1.py | 9 +++++---- vllm_ascend/worker/worker.py | 3 ++- 8 files changed, 20 insertions(+), 11 deletions(-) diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index bb956c2a8fd..f2f878d46e2 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 + VLLM_COMMIT=6290470843c131681e3e1318ae71070a34f33225 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 9a40f5e344c..a585aa762cc 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] + vllm_version: [6290470843c131681e3e1318ae71070a34f33225, v0.16.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 94393cf05fa..c5325cb2bed 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 + vllm: 6290470843c131681e3e1318ae71070a34f33225 changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -87,7 +87,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] + vllm_version: [6290470843c131681e3e1318ae71070a34f33225, v0.16.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -99,7 +99,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] + vllm_version: [6290470843c131681e3e1318ae71070a34f33225, v0.16.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index e693cf7bd18..4611e0db93d 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7] + vllm_version: [6290470843c131681e3e1318ae71070a34f33225] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/vllm_ascend/ops/mm_encoder_attention.py b/vllm_ascend/ops/mm_encoder_attention.py index 889b88c42da..c07c6888434 100644 --- a/vllm_ascend/ops/mm_encoder_attention.py +++ b/vllm_ascend/ops/mm_encoder_attention.py @@ -97,6 +97,7 @@ def forward_oot( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + sequence_lengths: torch.Tensor | None = None, ): bsz, q_len = query.size()[:2] kv_len = key.size(1) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index c0af4d00054..8d2d9d4a25a 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -355,6 +355,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if get_ascend_device_type() != AscendDeviceType._310P: compilation_config.custom_ops = ["all"] + # Upstream removed auto-forcing of +rms_norm for SP (PR #35410). + # On Ascend, SP requires rms_norm to go through forward_oot to avoid + # calling the CUDA-only torch.ops._C.rms_norm kernel. + if enable_sp(vllm_config) and "+rms_norm" not in compilation_config.custom_ops: + compilation_config.custom_ops.append("+rms_norm") + if ascend_config.recompute_scheduler_enable: from vllm_ascend.core.recompute_scheduler import RecomputeSchedulerConfig diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index ead030c2eff..069a255e0d9 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1813,16 +1813,17 @@ def _determine_batch_execution_and_padding( # ruff: noqa: E731 dispatch_cudagraph = ( - lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch( + lambda num_tokens, disable_full=False, valid_modes=None: self.cudagraph_dispatcher.dispatch( num_tokens=num_tokens, has_lora=has_lora, uniform_decode=uniform_decode, - disable_full=disable_full, + valid_modes={CUDAGraphMode.NONE} if force_eager else valid_modes, + invalid_modes={CUDAGraphMode.FULL} if disable_full else None, ) if not force_eager else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) ) - cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output) + cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, disable_full=use_cascade_attn or has_encoder_output) num_tokens_padded = batch_descriptor.num_tokens if enable_sp(self.vllm_config): assert batch_descriptor.num_tokens % self.vllm_config.parallel_config.tensor_parallel_size == 0, ( @@ -1844,7 +1845,7 @@ def _determine_batch_execution_and_padding( # Re-dispatch with DP padding cudagraph_mode, batch_descriptor = dispatch_cudagraph( num_tokens_padded, - disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value, + valid_modes={CUDAGraphMode(synced_cudagraph_mode)}, ) # Assert to make sure the agreed upon token count is correct otherwise # num_tokens_across_dp will no-longer be valid diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 1cadf05b233..56563a586bd 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -428,7 +428,7 @@ def load_model(self) -> None: with context, set_current_vllm_config(self.vllm_config): self.model_runner.load_model() - def compile_or_warm_up_model(self) -> None: + def compile_or_warm_up_model(self) -> float: # Note: need to adapt for graph mode. warmup_sizes = (self.vllm_config.compilation_config.compile_sizes or []).copy() if not self.model_config.enforce_eager: @@ -460,6 +460,7 @@ def compile_or_warm_up_model(self) -> None: # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) + return self.vllm_config.compilation_config.compilation_time def _warm_up_atb(self): x = torch.rand((2, 4), dtype=torch.float16).npu() From a8129ff85fde85dd6ac7ded161894a6066fa757a Mon Sep 17 00:00:00 2001 From: Meihan-chen Date: Mon, 2 Mar 2026 19:21:04 +0800 Subject: [PATCH 2/2] fix: adapt to upstream vLLM changes (15d76f74e..6290470843) Root causes: - CudagraphDispatcher.dispatch() API changed: disable_full -> valid_modes/invalid_modes (#34102) - compile_or_warm_up_model() must return float compilation_time (#35503) - MMEncoderAttention.forward_oot() gained new sequence_lengths param (#34580) - +rms_norm no longer auto-forced for SP, breaks Ascend without CUDA _C ops (#35410) Upstream commit range: 15d76f74e2fdb12a95ea00f0ca283acf6219a2b7..6290470843c131681e3e1318ae71070a34f33225 Co-Authored-By: Claude Code --- vllm_ascend/core/recompute_scheduler.py | 2 - vllm_ascend/platform.py | 13 +++++-- vllm_ascend/worker/model_runner_v1.py | 52 +++++++++++++++++-------- vllm_ascend/worker/worker.py | 2 + 4 files changed, 47 insertions(+), 22 deletions(-) diff --git a/vllm_ascend/core/recompute_scheduler.py b/vllm_ascend/core/recompute_scheduler.py index a33585582a7..466ed6099fc 100644 --- a/vllm_ascend/core/recompute_scheduler.py +++ b/vllm_ascend/core/recompute_scheduler.py @@ -23,7 +23,6 @@ from dataclasses import dataclass, fields import numpy as np -from vllm._bc_linter import bc_linter_include from vllm.config import SchedulerConfig, VllmConfig from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata from vllm.distributed.kv_events import KVEventBatch @@ -75,7 +74,6 @@ class RecomputeReqInfo: client_index: int = 0 -@bc_linter_include @dataclass class RecomputeSchedulerOutput(SchedulerOutput): recomputed_reqs: list[RecomputeReqInfo] | None = None diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 8d2d9d4a25a..b206d84f873 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -355,10 +355,15 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if get_ascend_device_type() != AscendDeviceType._310P: compilation_config.custom_ops = ["all"] - # Upstream removed auto-forcing of +rms_norm for SP (PR #35410). - # On Ascend, SP requires rms_norm to go through forward_oot to avoid - # calling the CUDA-only torch.ops._C.rms_norm kernel. - if enable_sp(vllm_config) and "+rms_norm" not in compilation_config.custom_ops: + # Upstream removed automatic +rms_norm forcing for SP (PR #35410), + # but Ascend needs it because torch.ops._C.rms_norm (CUDA kernel) is + # not available. Re-force it so the custom op wrapper routes to the + # OOT (NPU) implementation. + if ( + compilation_config.pass_config.enable_sp + and "+rms_norm" not in compilation_config.custom_ops + and "-rms_norm" not in compilation_config.custom_ops + ): compilation_config.custom_ops.append("+rms_norm") if ascend_config.recompute_scheduler_enable: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 069a255e0d9..5e7e23b1863 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -119,6 +119,7 @@ is_moe_model, lmhead_tp_enable, set_weight_prefetch_method, + vllm_version_is, ) from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.pcp_utils import PCPManager @@ -1812,18 +1813,31 @@ def _determine_batch_execution_and_padding( has_lora = len(self.input_batch.lora_id_to_lora_request) > 0 if force_has_lora is None else force_has_lora # ruff: noqa: E731 - dispatch_cudagraph = ( - lambda num_tokens, disable_full=False, valid_modes=None: self.cudagraph_dispatcher.dispatch( - num_tokens=num_tokens, - has_lora=has_lora, - uniform_decode=uniform_decode, - valid_modes={CUDAGraphMode.NONE} if force_eager else valid_modes, - invalid_modes={CUDAGraphMode.FULL} if disable_full else None, - ) - if not force_eager - else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) - ) - cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, disable_full=use_cascade_attn or has_encoder_output) + if vllm_version_is("0.16.0"): + + def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None): + if force_eager: + return (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) + return self.cudagraph_dispatcher.dispatch( + num_tokens=num_tokens, + has_lora=has_lora, + uniform_decode=uniform_decode, + disable_full=disable_full, + ) + else: + + def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None): + if force_eager: + return (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) + return self.cudagraph_dispatcher.dispatch( + num_tokens=num_tokens, + has_lora=has_lora, + uniform_decode=uniform_decode, + valid_modes=valid_modes, + invalid_modes={CUDAGraphMode.FULL} if disable_full else None, + ) + + cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output) num_tokens_padded = batch_descriptor.num_tokens if enable_sp(self.vllm_config): assert batch_descriptor.num_tokens % self.vllm_config.parallel_config.tensor_parallel_size == 0, ( @@ -1843,10 +1857,16 @@ def _determine_batch_execution_and_padding( dp_rank = self.parallel_config.data_parallel_rank num_tokens_padded = int(num_tokens_across_dp[dp_rank].item()) # Re-dispatch with DP padding - cudagraph_mode, batch_descriptor = dispatch_cudagraph( - num_tokens_padded, - valid_modes={CUDAGraphMode(synced_cudagraph_mode)}, - ) + if vllm_version_is("0.16.0"): + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, + disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value, + ) + else: + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, + valid_modes={CUDAGraphMode(synced_cudagraph_mode)}, + ) # Assert to make sure the agreed upon token count is correct otherwise # num_tokens_across_dp will no-longer be valid assert batch_descriptor.num_tokens == num_tokens_padded diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 56563a586bd..f35ac135e05 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -462,6 +462,8 @@ def compile_or_warm_up_model(self) -> float: set_random_seed(self.model_config.seed) return self.vllm_config.compilation_config.compilation_time + return self.vllm_config.compilation_config.compilation_time + def _warm_up_atb(self): x = torch.rand((2, 4), dtype=torch.float16).npu() weight = torch.rand((2, 4), dtype=torch.float16).npu()