diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 4bf3fc082ef..2030b570477 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d + VLLM_COMMIT=5b3ba94ab4bd9da739bcc27cdd05505467fa499e echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index 64068c64939..7f3ae9477fe 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d +ARG VLLM_COMMIT=5b3ba94ab4bd9da739bcc27cdd05505467fa499e RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index dfa0b74bc3d..c5bc6d5c138 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0] + vllm_version: [5b3ba94ab4bd9da739bcc27cdd05505467fa499e, v0.17.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 76664e0fbb2..be0b76d5803 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - 
vllm: 4034c3d32e30d01639459edd3ab486f56993876d + vllm: 5b3ba94ab4bd9da739bcc27cdd05505467fa499e changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -90,7 +90,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0] + vllm_version: [5b3ba94ab4bd9da739bcc27cdd05505467fa499e, v0.17.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -102,7 +102,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0] + vllm_version: [5b3ba94ab4bd9da739bcc27cdd05505467fa499e, v0.17.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index dd4f2c8484b..a149e744633 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d] + vllm_version: [5b3ba94ab4bd9da739bcc27cdd05505467fa499e] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 65bcad440b5..0b003d474af 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0 
tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 5b3ba94ab4bd9da739bcc27cdd05505467fa499e, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
 
 ## Release cadence
 
diff --git a/vllm_ascend/compilation/compiler_interface.py b/vllm_ascend/compilation/compiler_interface.py
index 22b6f8a1952..ff7b31a4fc6 100644
--- a/vllm_ascend/compilation/compiler_interface.py
+++ b/vllm_ascend/compilation/compiler_interface.py
@@ -17,6 +17,7 @@
 #
 import copy
 import functools
+import logging
 from collections.abc import Callable
 from typing import Any
 
@@ -33,6 +34,40 @@
 from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
 from vllm_ascend.utils import COMPILATION_PASS_KEY
 
+logger = logging.getLogger(__name__)
+
+
+def convert_fake_inputs_to_current_fake_mode(example_inputs: list[Any]) -> list[Any]:
+    """Rebuild foreign fake tensors in ``example_inputs`` under the active FakeTensorMode.
+
+    Works around a FakeTensorMode mismatch seen after the vllm upgrade: the
+    piecewise backend now compiles ranges upfront in __init__, which may use
+    fake tensors from graph placeholder nodes whose FakeTensorMode differs
+    from the one of the current tracing context.
+
+    Every tensor exposing a ``fake_mode`` other than the detected one is
+    replaced by the detected mode's ``from_tensor`` result in a new list. If
+    no fake mode is detected, a warning is logged and the input is returned.
+    """
+    from torch._guards import detect_fake_mode
+
+    active_mode = detect_fake_mode()
+    if active_mode is None:
+        logger.warning("detect_fake_mode() returned None. FakeTensorMode mismatch fix may not be applied.")
+        return example_inputs
+
+    rehomed: list[Any] = []
+    for item in example_inputs:
+        # Only tensors that carry a fake_mode different from the active one are rebuilt.
+        foreign = isinstance(item, torch.Tensor) and hasattr(item, "fake_mode") and item.fake_mode is not active_mode
+        if not foreign:
+            rehomed.append(item)
+            continue
+        source_mode = item.fake_mode
+        rehomed.append(active_mode.from_tensor(item))
+        logger.debug("Converting fake tensor from fake_mode %s to %s", source_mode, active_mode)
+    return rehomed
+
 
 def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
     recursive_compile_fx = functools.partial(compile_fx, inner_compile=inner_compile, decompositions=decompositions)
@@ -49,6 +84,9 @@ def fusion_pass_compile(
     compile_range: Range,
     key: str | None = None,
 ) -> tuple[Callable | None, Any | None]:
+    # Fix for FakeTensorMode mismatch issue in vllm upgrade
+    example_inputs = convert_fake_inputs_to_current_fake_mode(example_inputs)
+
     def compile_inner(graph, example_inputs):
         current_pass_manager = compiler_config[COMPILATION_PASS_KEY]
         graph = current_pass_manager(graph)
@@ -101,6 +139,15 @@ def npugraph_ex_compile(
 
     npugraph_ex = torchair.get_npu_backend(compiler_config=config)
 
+    # Apply graph fusion passes (including GELU replacement) before torchair compilation
+    # This is needed to replace unsupported operations like aten::gelu with NPU-compatible versions
+    if COMPILATION_PASS_KEY in compiler_config:
+        current_pass_manager = compiler_config[COMPILATION_PASS_KEY]
+        graph = current_pass_manager(graph)
+
+    # Fix for FakeTensorMode mismatch issue in vllm upgrade
+    example_inputs = convert_fake_inputs_to_current_fake_mode(example_inputs)
+
     # torch.compile requires the output of the fx graph to be a tuple
     if not graph_returns_tuple(graph):
         return make_graph_return_tuple(graph, example_inputs, npugraph_ex), None
diff --git a/vllm_ascend/compilation/graph_fusion_pass_manager.py b/vllm_ascend/compilation/graph_fusion_pass_manager.py
index 40acb081219..bb500d78396 100644
--- a/vllm_ascend/compilation/graph_fusion_pass_manager.py
+++ b/vllm_ascend/compilation/graph_fusion_pass_manager.py
@@ -73,3 +73,11 @@ def configure(self, config: VllmConfig):
             from .passes.sequence_parallelism import AscendSequenceParallelismPass
 
             self.passes.append(AscendSequenceParallelismPass(config))
+
+        # GELU replacement pass - needed for models like Whisper that use
GELU
+        # which is not natively supported on NPU. Uses torch_npu.npu_gelu
+        # which provides exact GELU computation on NPU devices.
+        if self.ascend_compilation_config.get("replace_gelu", True):
+            from .passes.gelu_replacement_pass import GeluReplacementPass
+
+            self.passes.append(GeluReplacementPass(config))
diff --git a/vllm_ascend/compilation/passes/gelu_replacement_pass.py b/vllm_ascend/compilation/passes/gelu_replacement_pass.py
new file mode 100644
index 00000000000..40ea6503721
--- /dev/null
+++ b/vllm_ascend/compilation/passes/gelu_replacement_pass.py
@@ -0,0 +1,210 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import annotations
+
+import torch
+from torch._inductor.pattern_matcher import PatternMatcherPass
+from vllm.config import VllmConfig
+from vllm.config.compilation import Range
+from vllm.logger import logger
+
+from vllm_ascend.compilation.passes.base_pattern import BasePattern
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.15.0"):
+    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
+else:
+    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
+
+
+class GeluPattern(BasePattern):
+    """
+    Pattern that matches torch.ops.aten.gelu.default and replaces it with
+    an NPU-compatible implementation using torch_npu.npu_gelu.
+
+    This is needed because aten::gelu is not supported on NPU and falls back
+    to CPU, which causes errors during graph capture due to host-device
+    synchronization restrictions.
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        super().__init__(vllm_config)
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        """
+        Return one random (2, 2048) NPU tensor of dtype ``self.dtype``.
+        """
+        x = torch.randn(2, 2048, device="npu", dtype=self.dtype)
+        return [x]
+
+    def get_pattern(self):
+        def pattern(x: torch.Tensor):
+            """
+            Pattern for standard GELU activation.
+            Note: the approximate argument is deliberately not passed, so this
+            is the default form where approximate="none" is used implicitly.
+            """
+            return torch.ops.aten.gelu.default(x)
+
+        return pattern
+
+    def get_replacement(self):
+        def replacement(x: torch.Tensor):
+            """
+            Replacement that uses NPU-compatible GELU implementation.
+
+            Calls torch_npu.npu_gelu, which is intended to provide exact GELU
+            computation on NPU devices and so avoid the CPU fallback.
+            """
+            import torch_npu
+
+            return torch_npu.npu_gelu(x)
+
+        return replacement
+
+
+class GeluInplacePattern(BasePattern):
+    """
+    Pattern that matches torch.ops.aten.gelu_.default (in-place GELU) and
+    replaces it with an NPU-compatible implementation.
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        super().__init__(vllm_config)
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        """
+        Return one random (2, 2048) NPU tensor of dtype ``self.dtype``.
+        """
+        x = torch.randn(2, 2048, device="npu", dtype=self.dtype)
+        return [x]
+
+    def get_pattern(self):
+        def pattern(x: torch.Tensor):
+            """
+            Pattern for in-place GELU activation.
+            """
+            return torch.ops.aten.gelu_.default(x)
+
+        return pattern
+
+    def get_replacement(self):
+        def replacement(x: torch.Tensor):
+            """
+            Replacement that uses NPU-compatible GELU implementation.
+
+            npu_gelu's result is copied back into x and x is returned.
+            """
+            import torch_npu
+
+            out = torch_npu.npu_gelu(x)
+            x.copy_(out)
+            return x
+
+        return replacement
+
+
+class GeluOutPattern(BasePattern):
+    """
+    Pattern that matches torch.ops.aten.gelu.out (GELU that writes into a
+    caller-provided output tensor) and replaces it with an NPU-compatible
+    implementation.
+
+    This variant is generated by torch.compile when compiling graphs with
+    upfront compilation (e.g., after vllm commit 5569f5218 which stops lazy
+    compilation). The signature is:
+        aten::gelu.out(Tensor self, *, str approximate="none", Tensor(a!) out)
+        -> Tensor(a!)
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        super().__init__(vllm_config)
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        """
+        Return [x, out]: a random (2, 2048) NPU tensor and an empty tensor like it.
+        The order must match how the function is called: gelu.out(x, out=out)
+        """
+        x = torch.randn(2, 2048, device="npu", dtype=self.dtype)
+        out = torch.empty_like(x)
+        return [x, out]
+
+    def get_pattern(self):
+        def pattern(x: torch.Tensor, out: torch.Tensor):
+            """
+            Pattern for GELU with output tensor.
+            Matches: torch.ops.aten.gelu.out(x, out=out)
+            """
+            return torch.ops.aten.gelu.out(x, out=out)
+
+        return pattern
+
+    def get_replacement(self):
+        def replacement(x: torch.Tensor, out: torch.Tensor):
+            """
+            Replacement that uses NPU-compatible GELU implementation.
+
+            Computes npu_gelu and copies the result to the output tensor.
+            """
+            import torch_npu
+
+            result = torch_npu.npu_gelu(x)
+            out.copy_(result)
+            return out
+
+        return replacement
+
+
+class GeluReplacementPass(VllmInductorPass):
+    """
+    A pass that replaces aten::gelu operations with NPU-compatible
+    implementations to enable graph capture on Ascend NPU devices.
+
+    Handles three variants:
+    - gelu.default: Standard GELU
+    - gelu_.default: In-place GELU
+    - gelu.out: GELU with pre-allocated output tensor
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        super().__init__(vllm_config)
+        self.pattern_match_passes: PatternMatcherPass = PatternMatcherPass(pass_name="gelu_replacement_pass")
+
+        dtype = vllm_config.model_config.dtype
+        if dtype not in (torch.float16, torch.bfloat16, torch.float32):
+            logger.debug("GELU replacement not enabled: unsupported dtype %s", dtype)
+            return  # no GELU patterns are registered for any other dtype
+
+        # Register the default, in-place and out= GELU variants on the shared matcher
+        GeluPattern(vllm_config).register(self.pattern_match_passes)
+        GeluInplacePattern(vllm_config).register(self.pattern_match_passes)
+        GeluOutPattern(vllm_config).register(self.pattern_match_passes)
+
+    def __call__(self, graph: torch.fx.Graph) -> None:  # type: ignore[override]
+        self.begin()
+        self.matched_count = self.pattern_match_passes.apply(graph)
+        if self.matched_count > 0:
+            logger.debug("Replaced %s gelu operations with NPU-compatible version", self.matched_count)
+        self.end_and_log()
+
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
+        """
+        GELU replacement is always applicable for all compile ranges.
+        """
+        return True
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index 4b8fc9d256f..f355c09332c 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -21,6 +21,7 @@
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_minimax_m2_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
+import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
 
 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py
index 50f74e60e8d..524e8db401e 100644
--- a/vllm_ascend/patch/platform/patch_multiproc_executor.py
+++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py
@@ -1,4 +1,5 @@
-import threading
+from __future__ import annotations
+
 import weakref
 from collections import deque
 from collections.abc import Callable
@@ -26,7 +27,6 @@ def _init_executor(self) -> None:
         # and ensure workers will be terminated.
self._finalizer = weakref.finalize(self, self.shutdown) self.is_failed = False - self.shutdown_event = threading.Event() self.failure_callback: FailureCallback | None = None tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes() @@ -66,20 +66,28 @@ def _init_executor(self) -> None: success = False try: global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp + # When using fork, keep track of socket file descriptors that are + # inherited by the worker, so that we can close them in subsequent + # workers + inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None + for local_rank in range(self.local_world_size): global_rank = global_start_rank + local_rank is_driver_worker = self._is_driver_worker(global_rank) - unready_workers.append( - AscendWorkerProc.make_worker_process( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=global_rank, - distributed_init_method=distributed_init_method, - input_shm_handle=scheduler_output_handle, - shared_worker_lock=shared_worker_lock, - is_driver_worker=is_driver_worker, - ) + unready_worker_handle = AscendWorkerProc.make_worker_process( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=global_rank, + distributed_init_method=distributed_init_method, + input_shm_handle=scheduler_output_handle, + shared_worker_lock=shared_worker_lock, + is_driver_worker=is_driver_worker, + inherited_fds=inherited_fds, ) + unready_workers.append(unready_worker_handle) + if inherited_fds is not None: + inherited_fds.append(unready_worker_handle.death_writer.fileno()) + inherited_fds.append(unready_worker_handle.ready_pipe.fileno()) # Workers must be created before wait_for_ready to avoid # deadlock, since worker.init_device() does a device sync. 
@@ -124,6 +132,7 @@ def _init_executor(self) -> None: for uw in unready_workers: if uw.death_writer is not None: uw.death_writer.close() + uw.death_writer = None self._ensure_worker_termination([uw.proc for uw in unready_workers]) self.output_rank = self._get_output_rank() @@ -158,24 +167,28 @@ def make_worker_process( input_shm_handle, # Receive SchedulerOutput shared_worker_lock: LockType, is_driver_worker: bool = False, + inherited_fds: list[int] | None = None, ) -> UnreadyWorkerProcHandle: context = get_mp_context() - # (reader, writer) - reader, writer = context.Pipe(duplex=False) - - # Create death pipe to detect parent process exit + # Ready pipe to communicate readiness from child to parent + ready_reader, ready_writer = context.Pipe(duplex=False) + # Death pipe to let child detect parent process exit death_reader, death_writer = context.Pipe(duplex=False) - + if inherited_fds is not None: + inherited_fds = inherited_fds.copy() + inherited_fds.extend((ready_reader.fileno(), death_writer.fileno())) process_kwargs = { "vllm_config": vllm_config, "local_rank": local_rank, "rank": rank, "distributed_init_method": distributed_init_method, "input_shm_handle": input_shm_handle, - "ready_pipe": (reader, writer), + "ready_pipe": ready_writer, "death_pipe": death_reader, "shared_worker_lock": shared_worker_lock, "is_driver_worker": is_driver_worker, + # Have the worker close parent end of this worker's pipes too + "inherited_fds": inherited_fds if inherited_fds is not None else [], } # Run EngineCore busy loop in background process. 
proc = context.Process(
@@ -186,10 +199,12 @@ def make_worker_process(
         )
 
         proc.start()
-        writer.close()
+        # Close child ends of pipes here in the parent
+        ready_writer.close()
+        death_reader.close()
         # Keep death_writer open in parent - when parent exits,
         # death_reader in child will get EOFError
-        return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
+        return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
 
 
 vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
diff --git a/vllm_ascend/patch/platform/patch_torch_accelerator.py b/vllm_ascend/patch/platform/patch_torch_accelerator.py
new file mode 100644
index 00000000000..431dce4e51b
--- /dev/null
+++ b/vllm_ascend/patch/platform/patch_torch_accelerator.py
@@ -0,0 +1,8 @@
+import torch
+
+
+def patch_empty_cache() -> None:  # replacement bound to torch.accelerator.empty_cache below
+    torch.npu.empty_cache()  # NOTE(review): torch.npu presumably requires torch_npu to be imported first - confirm
+
+
+torch.accelerator.empty_cache = patch_empty_cache  # executed when this module is imported
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index aa86823fe45..f7f33125dc2 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -46,7 +46,7 @@
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
 from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
-from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
+from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled
 
 # Currently we will fix block size to a small one since `num_reqs` can't be too large
 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -663,24 +663,7 @@ def _propose(
             if not self.parallel_drafting:
                 for draft_step in range(1, self.num_speculative_tokens):
                     per_layer_attn_metadata = dict()
-                    if vllm_version_is("0.17.0"):
-                        for attn_group in self.draft_attn_groups:
-                            common_attn_metadata, attn_metadata =
self.attn_update_stack_num_spec_norm( - draft_step, - attn_metadata, - common_attn_metadata, - batch_size, - num_input_tokens, - used_update_positions, - aclgraph_runtime_mode, - ori_seq_len, - slot_indices, - mtp_slot_mapping, - attn_group=attn_group, - ) - for layer_name in self.attn_layer_names: - per_layer_attn_metadata[layer_name] = attn_metadata - else: + for attn_group in self.draft_attn_groups: common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm( draft_step, attn_metadata, @@ -692,6 +675,7 @@ def _propose( ori_seq_len, slot_indices, mtp_slot_mapping, + attn_group=attn_group, ) for layer_name in self.attn_layer_names: per_layer_attn_metadata[layer_name] = attn_metadata @@ -701,21 +685,7 @@ def _propose( if not self.parallel_drafting: for draft_step in range(1, self.num_speculative_tokens): per_layer_attn_metadata = dict() - if vllm_version_is("0.17.0"): - for attn_group in self.draft_attn_groups: - common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm( - draft_step, - attn_metadata, - common_attn_metadata, - batch_size, - num_input_tokens, - used_update_positions, - aclgraph_runtime_mode, - attn_group=attn_group, - ) - for layer_name in self.attn_layer_names: - per_layer_attn_metadata[layer_name] = attn_metadata - else: + for attn_group in self.draft_attn_groups: common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm( draft_step, attn_metadata, @@ -724,6 +694,7 @@ def _propose( num_input_tokens, used_update_positions, aclgraph_runtime_mode, + attn_group=attn_group, ) for layer_name in self.attn_layer_names: per_layer_attn_metadata[layer_name] = attn_metadata @@ -1064,16 +1035,11 @@ def set_inputs_first_pass( # 2. # Recompute the slot mapping based on the new positions and # rejection mask. 
- if vllm_version_is("0.17.0"): - # Use the first draft attention group's kv_cache_spec for block_size - # (all draft layers share the same kv-cache group) - assert len(self.draft_attn_groups) > 0 - block_size = self.draft_attn_groups[0].kv_cache_spec.block_size - else: - if self.attn_metadata_builder is None: - block_size = self._get_attention_metadata_builder().kv_cache_spec.block_size - else: - block_size = self.attn_metadata_builder.kv_cache_spec.block_size + # Use the first draft attention group's kv_cache_spec for block_size + # (all draft layers share the same kv-cache group) + assert len(self.draft_attn_groups) > 0 + block_size = self.draft_attn_groups[0].kv_cache_spec.block_size + new_slot_mapping = compute_new_slot_mapping( cad=cad, new_positions=self.positions[:total_num_output_tokens], @@ -1112,8 +1078,7 @@ def attn_update_stack_num_spec_norm( attn_group=None, ): assert draft_step > 0 - if vllm_version_is("0.17.0"): - assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group" + assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group" common_attn_metadata = self.shallow_copy_metadata(old_common_metadata) if draft_step == 1: @@ -1224,13 +1189,7 @@ def attn_update_stack_num_spec_norm( # Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx] common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step] - if vllm_version_is("0.17.0"): - attn_metadata_builder = attn_group.get_metadata_builder() - else: - if self.attn_metadata_builder is None: - attn_metadata_builder = self._get_attention_metadata_builder() - else: - attn_metadata_builder = self.attn_metadata_builder + attn_metadata_builder = attn_group.get_metadata_builder() attn_metadata = attn_metadata_builder.build_for_drafting( common_attn_metadata=common_attn_metadata, diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 017ca2d32db..98542907670 100644 --- 
a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -126,7 +126,6 @@ is_moe_model, lmhead_tp_enable, set_weight_prefetch_method, - vllm_version_is, ) from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.pcp_utils import PCPManager @@ -398,15 +397,14 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.cpu_slot_mapping = None self.sampling_done_event: torch.npu.Event | None = None - if vllm_version_is("0.17.0"): - # self.cudagraph_batch_sizes sorts in ascending order. - if ( - self.compilation_config.cudagraph_capture_sizes - and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - ): - self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) - else: - self.cudagraph_batch_sizes = [] + # self.cudagraph_batch_sizes sorts in ascending order. + if ( + self.compilation_config.cudagraph_capture_sizes + and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + ): + self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) + else: + self.cudagraph_batch_sizes = [] self.mamba_state_idx: dict[str, int] = {} self._mamba_copy_bufs: mamba_utils.MambaCopyBuffers | None = None @@ -1362,7 +1360,7 @@ def execute_model( skip_compiled=has_encoder_input, ), self.maybe_get_kv_connector_output( - scheduler_output, clear_metadata=clear_kv_metadata + scheduler_output, defer_finalize=not clear_kv_metadata ) as kv_connector_output, ): hidden_states = self._model_forward( @@ -2565,14 +2563,13 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: self.may_reinitialize_input_batch(kv_cache_config) kv_caches = self.initialize_kv_cache_tensors(kv_cache_config) - if vllm_version_is("0.17.0"): - # TODO: refactor the logic of attention - # Initialize drafter attention group initialization - if self.speculative_config and ( - self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model() - ): - assert 
isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer) - self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes) + # TODO: refactor the logic of attention + # Initialize drafter attention group initialization + if self.speculative_config and ( + self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model() + ): + assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer) + self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes) if has_kv_transfer_group(): get_kv_transfer_group().register_kv_caches(kv_caches) @@ -2962,7 +2959,7 @@ def may_reinitialize_input_batch(self, kv_cache_config: KVCacheConfig) -> None: max_num_blocks.append(max_num_blocks_per_req) if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]: - assert self.cache_config.cpu_offload_gb == 0, ( + assert self.offload_config.uva.cpu_offload_gb == 0, ( "Cannot re-initialize the input batch when CPU weight " "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501 "for more details."