diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index bb956c2a8fd..f2f878d46e2 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 + VLLM_COMMIT=6290470843c131681e3e1318ae71070a34f33225 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 9a40f5e344c..a585aa762cc 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] + vllm_version: [6290470843c131681e3e1318ae71070a34f33225, v0.16.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 94393cf05fa..c5325cb2bed 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 15d76f74e2fdb12a95ea00f0ca283acf6219a2b7 + vllm: 6290470843c131681e3e1318ae71070a34f33225 changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -87,7 +87,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] + vllm_version: [6290470843c131681e3e1318ae71070a34f33225, v0.16.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -99,7 +99,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: 
[15d76f74e2fdb12a95ea00f0ca283acf6219a2b7, v0.16.0] + vllm_version: [6290470843c131681e3e1318ae71070a34f33225, v0.16.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index e693cf7bd18..4611e0db93d 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [15d76f74e2fdb12a95ea00f0ca283acf6219a2b7] + vllm_version: [6290470843c131681e3e1318ae71070a34f33225] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/vllm_ascend/core/recompute_scheduler.py b/vllm_ascend/core/recompute_scheduler.py index a33585582a7..466ed6099fc 100644 --- a/vllm_ascend/core/recompute_scheduler.py +++ b/vllm_ascend/core/recompute_scheduler.py @@ -23,7 +23,6 @@ from dataclasses import dataclass, fields import numpy as np -from vllm._bc_linter import bc_linter_include from vllm.config import SchedulerConfig, VllmConfig from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata from vllm.distributed.kv_events import KVEventBatch @@ -75,7 +74,6 @@ class RecomputeReqInfo: client_index: int = 0 -@bc_linter_include @dataclass class RecomputeSchedulerOutput(SchedulerOutput): recomputed_reqs: list[RecomputeReqInfo] | None = None diff --git a/vllm_ascend/ops/mm_encoder_attention.py b/vllm_ascend/ops/mm_encoder_attention.py index 889b88c42da..c07c6888434 100644 --- a/vllm_ascend/ops/mm_encoder_attention.py +++ b/vllm_ascend/ops/mm_encoder_attention.py @@ -97,6 +97,7 @@ def forward_oot( value: torch.Tensor, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + sequence_lengths: torch.Tensor | None 
= None, ): bsz, q_len = query.size()[:2] kv_len = key.size(1) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index c0af4d00054..b206d84f873 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -355,6 +355,17 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if get_ascend_device_type() != AscendDeviceType._310P: compilation_config.custom_ops = ["all"] + # Upstream removed automatic +rms_norm forcing for SP (PR #35410), + # but Ascend needs it because torch.ops._C.rms_norm (CUDA kernel) is + # not available. Re-force it so the custom op wrapper routes to the + # OOT (NPU) implementation. + if ( + compilation_config.pass_config.enable_sp + and "+rms_norm" not in compilation_config.custom_ops + and "-rms_norm" not in compilation_config.custom_ops + ): + compilation_config.custom_ops.append("+rms_norm") + if ascend_config.recompute_scheduler_enable: from vllm_ascend.core.recompute_scheduler import RecomputeSchedulerConfig diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index ead030c2eff..5e7e23b1863 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -119,6 +119,7 @@ is_moe_model, lmhead_tp_enable, set_weight_prefetch_method, + vllm_version_is, ) from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.pcp_utils import PCPManager @@ -1812,16 +1813,30 @@ def _determine_batch_execution_and_padding( has_lora = len(self.input_batch.lora_id_to_lora_request) > 0 if force_has_lora is None else force_has_lora # ruff: noqa: E731 - dispatch_cudagraph = ( - lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch( - num_tokens=num_tokens, - has_lora=has_lora, - uniform_decode=uniform_decode, - disable_full=disable_full, - ) - if not force_eager - else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) - ) + if vllm_version_is("0.16.0"): + + def dispatch_cudagraph(num_tokens, disable_full=False, 
valid_modes=None): + if force_eager: + return (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) + return self.cudagraph_dispatcher.dispatch( + num_tokens=num_tokens, + has_lora=has_lora, + uniform_decode=uniform_decode, + disable_full=disable_full, + ) + else: + + def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None): + if force_eager: + return (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) + return self.cudagraph_dispatcher.dispatch( + num_tokens=num_tokens, + has_lora=has_lora, + uniform_decode=uniform_decode, + valid_modes=valid_modes, + invalid_modes={CUDAGraphMode.FULL} if disable_full else None, + ) + cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output) num_tokens_padded = batch_descriptor.num_tokens if enable_sp(self.vllm_config): @@ -1842,10 +1857,16 @@ def _determine_batch_execution_and_padding( dp_rank = self.parallel_config.data_parallel_rank num_tokens_padded = int(num_tokens_across_dp[dp_rank].item()) # Re-dispatch with DP padding - cudagraph_mode, batch_descriptor = dispatch_cudagraph( - num_tokens_padded, - disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value, - ) + if vllm_version_is("0.16.0"): + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, + disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value, + ) + else: + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, + valid_modes={CUDAGraphMode(synced_cudagraph_mode)}, + ) # Assert to make sure the agreed upon token count is correct otherwise # num_tokens_across_dp will no-longer be valid assert batch_descriptor.num_tokens == num_tokens_padded diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 1cadf05b233..f35ac135e05 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -428,7 +428,7 @@ def load_model(self) -> None: with context, set_current_vllm_config(self.vllm_config): 
self.model_runner.load_model() - def compile_or_warm_up_model(self) -> None: + def compile_or_warm_up_model(self) -> float: # Note: need to adapt for graph mode. warmup_sizes = (self.vllm_config.compilation_config.compile_sizes or []).copy() if not self.model_config.enforce_eager: @@ -460,6 +460,7 @@ def compile_or_warm_up_model(self) -> None: # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) + return self.vllm_config.compilation_config.compilation_time def _warm_up_atb(self): x = torch.rand((2, 4), dtype=torch.float16).npu()