diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 1fe73f355a2..45f877098b7 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -37,7 +37,7 @@ on: # Current supported vLLM versions options: - main - - v0.9.2 + - v0.10.0 - v0.9.1 - v0.7.3 vllm-ascend-version: @@ -163,7 +163,7 @@ jobs: repository: vllm-project/vllm path: ./vllm-empty # Please also update this when bump matched version - ref: ${{ github.event.inputs.vllm-version || 'v0.9.2' }} + ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }} - name: Install vllm-project/vllm from source working-directory: ./vllm-empty diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 9598097576a..d25ec5bcbb2 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -51,7 +51,7 @@ jobs: strategy: matrix: include: - - vllm_branch: v0.9.2 + - vllm_branch: v0.10.0 vllm_ascend_branch: main vllm_use_v1: 1 max-parallel: 1 diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 5a361a3bf10..e08b80a50f9 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -81,7 +81,7 @@ jobs: VLLM_USE_MODELSCOPE: True strategy: matrix: - vllm_version: [main, v0.9.2] + vllm_version: [main, v0.10.0] steps: - name: Install packages run: | @@ -137,7 +137,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1] - vllm_version: [main, v0.9.2] + vllm_version: [main, v0.10.0] name: singlecard e2e test runs-on: ${{ matrix.os }} container: @@ -216,7 +216,7 @@ jobs: max-parallel: 1 matrix: os: [linux-arm64-npu-4] - vllm_version: [main, v0.9.2] + vllm_version: [main, v0.10.0] name: multicard e2e test runs-on: ${{ matrix.os }} container: diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 959e9c2ee86..02070684c0e 
100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -43,7 +43,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [main, v0.9.2] + vllm_version: [main, v0.10.0] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} container: diff --git a/Dockerfile b/Dockerfile index f839568257a..cd135dc82a9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.2 +ARG VLLM_TAG=v0.10.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index 4342ce1f4d4..675e716c3c1 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.2 +ARG VLLM_TAG=v0.10.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index 7d4b23cf6b6..f9f5ac6600a 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -34,7 +34,7 @@ COPY . 
/vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.2 +ARG VLLM_TAG=v0.10.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/Dockerfile.a3 b/Dockerfile.a3 index 8deae5790c9..ec4ef48f187 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.2 +ARG VLLM_TAG=v0.10.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index f0a1001260a..3beaece2d45 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.2 +ARG VLLM_TAG=v0.10.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index eba0dcb3665..3603b746060 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.2 +ARG VLLM_TAG=v0.10.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
diff --git a/docs/source/conf.py b/docs/source/conf.py index e354ef99375..f304cb6acb9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -77,7 +77,7 @@ # CANN image tag 'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10", # vllm version in ci - 'ci_vllm_version': 'v0.9.2', + 'ci_vllm_version': 'v0.10.0', } # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/developer_guide/feature_guide/patch.md b/docs/source/developer_guide/feature_guide/patch.md index 26843aed8e4..19bb2885a0e 100644 --- a/docs/source/developer_guide/feature_guide/patch.md +++ b/docs/source/developer_guide/feature_guide/patch.md @@ -38,15 +38,15 @@ vllm_ascend In both **platform** and **worker** folder, there are several patch modules. They are used for patching different version of vLLM. -- `patch_0_9_2`: This module is used for patching vLLM 0.9.2. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_9_2` is used for patching vLLM 0.9.2. +- `patch_0_10_0`: This module is used for patching vLLM 0.10.0. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_10_0` is used for patching vLLM 0.10.0. - `patch_main`: This module is used for patching the code in vLLM main branch. -- `patch_common`: This module is used for patching both vLLM 0.9.2 and vLLM main branch. +- `patch_common`: This module is used for patching both vLLM 0.10.0 and vLLM main branch. ## How to write a patch Before writing a patch, following the principle above, we should patch the least code. If it's necessary, we can patch the code in either **platform** and **worker** folder. Here is an example to patch `distributed` module in vLLM. -1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.9.2 and main of vLLM. +1. 
Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.10.0 and main of vLLM. 2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`. 3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`. 4. Write your patch code in the new file. Here is an example: @@ -82,4 +82,4 @@ Before writing a patch, following the principle above, we should patch the least ## Limitation 1. In V1 Engine, vLLM starts three kinds of process: Main process, EngineCore process and Worker process. Now vLLM Ascend only support patch the code in Main process and Worker process by default. If you want to patch the code runs in EngineCore process, you should patch EngineCore process entirely during setup, the entry code is here `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely. -2. If you are running an edited vLLM code, the version of the vLLM may be changed automatically. For example, if you runs an edited vLLM based on v0.9.n, the version of vLLM may be change to v0.9.nxxx, in this case, the patch for v0.9.n in vLLM Ascend would not work as expect, because that vLLM Ascend can't distinguish the version of vLLM you're using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, then the patch for v0.9.2 should work. +2. If you are running an edited vLLM code, the version of the vLLM may be changed automatically. For example, if you run an edited vLLM based on v0.10.n, the version of vLLM may be changed to v0.10.nxxx; in this case, the patch for v0.10.n in vLLM Ascend would not work as expected, because vLLM Ascend can't distinguish the version of vLLM you're using. 
In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, then the patch for v0.10.0 should work. diff --git a/pyproject.toml b/pyproject.toml index 89783f0fdd0..390d8c4dfb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,8 @@ requires = [ "msgpack", "quart", "numba", + # Remove after https://github.com/vllm-project/vllm-ascend/issues/2034 + "transformers<4.54.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 375b55433e2..c2b2a3175eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,8 @@ setuptools-scm>=8 torch>=2.5.1 torchvision<0.21.0 wheel +# Remove after https://github.com/vllm-project/vllm-ascend/issues/2034 +transformers<4.54.0 # requirements for disaggregated prefill msgpack diff --git a/tests/e2e/singlecard/test_offline_inference.py b/tests/e2e/singlecard/test_offline_inference.py index 400fe80d56d..e5b9364de1b 100644 --- a/tests/e2e/singlecard/test_offline_inference.py +++ b/tests/e2e/singlecard/test_offline_inference.py @@ -127,3 +127,19 @@ def test_models_topk() -> None: enforce_eager=True, gpu_memory_utilization=0.7) as vllm_model: vllm_model.generate(example_prompts, sampling_params) + + +def test_models_prompt_logprobs() -> None: + """Smoke test: generate_greedy_logprobs must run without raising.""" + example_prompts = [ + "Hello, my name is", + ] + + with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct", + max_model_len=8192, + dtype="float16", + enforce_eager=True, + gpu_memory_utilization=0.7) as vllm_model: + vllm_model.generate_greedy_logprobs(example_prompts, + max_tokens=5, + num_logprobs=1) diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py index 735462fd2e9..2838a145edc 100644 --- a/tests/ut/attention/test_attention_v1.py +++ b/tests/ut/attention/test_attention_v1.py @@ -3,15 +3,12 @@ import torch from tests.ut.base import TestBase -from vllm_ascend.attention.attention_v1 import \ - AscendAttentionBackendImpl092 # isort: skip from 
vllm_ascend.attention.attention_v1 import (AscendAttentionBackend, AscendAttentionBackendImpl, AscendAttentionMetadataBuilder, AscendAttentionState, AscendMetadata, CommonAttentionState) -from vllm_ascend.utils import vllm_version_is class TestAscendAttentionBackend(TestBase): @@ -20,12 +17,8 @@ def test_get_name(self): self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND") def test_get_impl_cls(self): - if vllm_version_is("0.9.2"): - self.assertEqual(AscendAttentionBackend.get_impl_cls(), - AscendAttentionBackendImpl092) - else: - self.assertEqual(AscendAttentionBackend.get_impl_cls(), - AscendAttentionBackendImpl) + self.assertEqual(AscendAttentionBackend.get_impl_cls(), + AscendAttentionBackendImpl) def test_get_metadata_cls(self): self.assertEqual(AscendAttentionBackend.get_metadata_cls(), diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index 15515653a1b..12fdf574039 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -17,7 +17,7 @@ from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import List, Optional, Tuple, Type import torch import torch_npu @@ -31,7 +31,7 @@ from vllm_ascend.ops.attention import vanilla_chunked_prefill from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, - nd_to_nz_2d, nd_to_nz_spec, vllm_version_is) + nd_to_nz_2d, nd_to_nz_spec) class AscendAttentionBackend(AttentionBackend): @@ -43,8 +43,6 @@ def get_name() -> str: @staticmethod def get_impl_cls() -> Type["AscendAttentionBackendImpl"]: - if vllm_version_is("0.9.2"): - return AscendAttentionBackendImpl092 return AscendAttentionBackendImpl @staticmethod @@ -440,38 +438,6 @@ def forward( return output.view(num_tokens, self.hidden_size) -class AscendAttentionBackendImpl092(AscendAttentionBackendImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, 
- alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - super().__init__( - num_heads=num_heads, - head_size=head_size, - scale=scale, - num_kv_heads=num_kv_heads, - alibi_slopes=alibi_slopes, - sliding_window=sliding_window, - kv_cache_dtype=kv_cache_dtype, - logits_soft_cap=logits_soft_cap, - attn_type=attn_type, - kv_sharing_target_layer_name=kv_sharing_target_layer_name, - use_irope=use_irope, - ) - - def unified_ascend_attention_with_output( query: torch.Tensor, key: torch.Tensor, diff --git a/vllm_ascend/attention/attention_v1_torchair.py b/vllm_ascend/attention/attention_v1_torchair.py index 84cfbd0f026..5beb78dd8d4 100644 --- a/vllm_ascend/attention/attention_v1_torchair.py +++ b/vllm_ascend/attention/attention_v1_torchair.py @@ -16,7 +16,7 @@ # from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import List, Optional, Tuple, Type import numpy as np import torch @@ -29,7 +29,7 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, - nd_to_nz_2d, vllm_version_is) + nd_to_nz_2d) class AscendAttentionTorchairBackend(AttentionBackend): @@ -41,8 +41,6 @@ def get_name() -> str: @staticmethod def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]: - if vllm_version_is("0.9.2"): - return AscendAttentionTorchairBackendImpl092 return AscendAttentionTorchairBackendImpl @staticmethod @@ -489,36 +487,3 @@ def forward( "to use ascend scheduler.") return output.view(num_tokens, self.hidden_size) - - -class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl - ): - - def __init__( - self, - num_heads: int, - head_size: int, - 
scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - super().__init__( - num_heads=num_heads, - head_size=head_size, - scale=scale, - num_kv_heads=num_kv_heads, - alibi_slopes=alibi_slopes, - sliding_window=sliding_window, - kv_cache_dtype=kv_cache_dtype, - logits_soft_cap=logits_soft_cap, - attn_type=attn_type, - kv_sharing_target_layer_name=kv_sharing_target_layer_name, - use_irope=use_irope, - ) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 0645a7887bb..78b89bb8c83 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -1,12 +1,11 @@ from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, - TypeVar) +from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar import numpy as np import torch import torch_npu from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer, - AttentionMetadata, AttentionType, + AttentionMetadata, MLAAttentionImpl) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import get_current_vllm_config @@ -22,7 +21,7 @@ from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor -from vllm_ascend.utils import npu_prefetch, vllm_version_is +from vllm_ascend.utils import npu_prefetch from vllm_ascend.worker.npu_input_batch import InputBatch if TYPE_CHECKING: @@ -54,8 +53,6 @@ def get_kv_cache_shape(num_blocks: int, block_size: int, num_kv_heads: int, @staticmethod def get_impl_cls() -> Type["MLAAttentionImpl"]: - if 
vllm_version_is("0.9.2"): - return AscendMLAImpl092 return AscendMLAImpl @@ -1212,34 +1209,3 @@ def forward( output[:num_decode_tokens] = output_decode return output_padded - - -class AscendMLAImpl092(AscendMLAImpl): - - def __init__(self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - **kwargs) -> None: - super().__init__( - num_heads=num_heads, - head_size=head_size, - scale=scale, - num_kv_heads=num_kv_heads, - alibi_slopes=alibi_slopes, - sliding_window=sliding_window, - kv_cache_dtype=kv_cache_dtype, - logits_soft_cap=logits_soft_cap, - attn_type=attn_type, - kv_sharing_target_layer_name=kv_sharing_target_layer_name, - use_irope=use_irope, - **kwargs) diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index d7e84fc813a..667328dddb5 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -32,8 +32,6 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - class AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -283,23 +281,12 @@ def skip_cur_request(): # allow the lower-priority requests to be scheduled. 
req_index += 1 continue - if vllm_version_is("0.9.2"): - num_draft_tokens = max( - num_new_tokens + request.num_computed_tokens - - request.num_tokens, 0) while True: - if vllm_version_is("0.9.2"): - new_blocks = self.kv_cache_manager.allocate_slots( - request, - num_new_tokens, - num_draft_tokens=num_draft_tokens, - num_lookahead_tokens=self.num_lookahead_tokens) - else: - new_blocks = self.kv_cache_manager.allocate_slots( - request, - num_new_tokens, - num_lookahead_tokens=self.num_lookahead_tokens) + new_blocks = self.kv_cache_manager.allocate_slots( + request, + num_new_tokens, + num_lookahead_tokens=self.num_lookahead_tokens) if new_blocks is None: # The request cannot be scheduled. # Preempt the lowest-priority request. diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index a64eb0f0e9d..61d92ea6124 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -24,9 +24,9 @@ # each worker's `__init__` function. # # Then in each kind of patch, there are three folders: -# - patch_0_9_2: contains the patches applied when vllm version is 0.9.2. +# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0. # - patch_main: contains the patches applied when vllm version is main branch. -# - patch_common: contains the patches applied in both 0.9.2 and main branch. +# - patch_common: contains the patches applied in both 0.10.0 and main branch. # # Once a new patch is added in vllm-ascend, please add the patch description into this file as well. # ---------------------------------------------------------------------------------- @@ -101,3 +101,16 @@ # - https://github.com/vllm-project/vllm-ascend/pull/1732 # Future Plan: # Revert it when the ascend scatter performance improves. +# +# ** File: worker/patch_0_10_0/patch_sampler_gather_logprobs.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. 
`vllm.v1.sample.sampler.Sampler.gather_logprobs` +# Why: +# We need to patch gather_logprobs to make sure call batched_count_greater_than +# with backend=current_platform.simple_compile_backend +# How: +# Patch gather_logprobs call new batched_count_greater_than +# Related PR (if no, explain why): +# - https://github.com/vllm-project/vllm/pull/21591 +# Future Plan: +# Revert it when vLLM merge #21591 and release new version diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index 4e0fed578bb..c6512f5fb3a 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -17,8 +17,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.2"): - from vllm_ascend.patch.platform import patch_0_9_2 # noqa: F401 +if vllm_version_is("0.10.0"): + from vllm_ascend.patch.platform import patch_0_10_0 # noqa: F401 from vllm_ascend.patch.platform import patch_common # noqa: F401 else: from vllm_ascend.patch.platform import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_2/__init__.py b/vllm_ascend/patch/platform/patch_0_10_0/__init__.py similarity index 100% rename from vllm_ascend/patch/platform/patch_0_9_2/__init__.py rename to vllm_ascend/patch/platform/patch_0_10_0/__init__.py diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index de7219ad2e1..a3e572b0e69 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -18,8 +18,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.2"): - from vllm_ascend.patch.worker import patch_0_9_2 # noqa: F401 +if vllm_version_is("0.10.0"): + from vllm_ascend.patch.worker import patch_0_10_0 # noqa: F401 from vllm_ascend.patch.worker import patch_common # noqa: F401 else: from vllm_ascend.patch.worker import patch_common # noqa: 
F401 diff --git a/vllm_ascend/patch/worker/patch_0_9_2/__init__.py b/vllm_ascend/patch/worker/patch_0_10_0/__init__.py similarity index 88% rename from vllm_ascend/patch/worker/patch_0_9_2/__init__.py rename to vllm_ascend/patch/worker/patch_0_10_0/__init__.py index 116c73c06c8..d95e2e302dc 100644 --- a/vllm_ascend/patch/worker/patch_0_9_2/__init__.py +++ b/vllm_ascend/patch/worker/patch_0_10_0/__init__.py @@ -14,3 +14,5 @@ # See the License for the specific language governing permissions and # limitations under the License. # + +import vllm_ascend.patch.worker.patch_0_10_0.patch_sampler_gather_logprobs # noqa diff --git a/vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py b/vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py new file mode 100644 index 00000000000..1e6b44ea8b8 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_0_10_0/patch_sampler_gather_logprobs.py @@ -0,0 +1,87 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import torch +from vllm.platforms import current_platform +from vllm.v1.outputs import LogprobsTensors +from vllm.v1.sample.sampler import Sampler + + +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) +def batched_count_greater_than(x: torch.Tensor, + values: torch.Tensor) -> torch.Tensor: + """ + Counts elements in each row of x that are greater than or equal to (`>=`, + despite the name) the corresponding value in values. Use torch.compile to + generate an optimized kernel for this function; otherwise, it will create + additional copies of the input tensors and cause memory issues. + Args: + x (torch.Tensor): A 2D tensor of shape (batch_size, n_elements). + values (torch.Tensor): A 2D tensor of shape (batch_size, 1). + Returns: + torch.Tensor: A 1D tensor of shape (batch_size,) with the counts. + """ + return (x >= values).sum(-1) + + +def gather_logprobs( + self, + logprobs: torch.Tensor, + num_logprobs: int, + token_ids: torch.Tensor, +) -> LogprobsTensors: + """ + Gather logprobs for topk and sampled/prompt token. + + Args: + logprobs: (num tokens) x (vocab) tensor + num_logprobs: minimum number of logprobs to + retain per token + token_ids: prompt tokens (if prompt logprobs) + or sampled tokens (if sampled + logprobs); 1D token ID tensor + with (num tokens) elements + Must be int64. + + Returns: + Top-k int indices tensor, (num tokens) x (num_logprobs + 1) + Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1) + Sampled token rank tensor, (num tokens) + """ + assert token_ids.dtype == torch.int64 + # Find the topK values. + topk_logprobs, topk_indices = torch.topk(logprobs, num_logprobs, dim=-1) + + # Get the logprob of the prompt or sampled token. + token_ids = token_ids.unsqueeze(-1) + token_logprobs = logprobs.gather(-1, token_ids) + + # Compute the 1-based rank of the actual token (entries with logprob >= its own). + token_ranks = batched_count_greater_than(logprobs, token_logprobs) + + # Concatenate together with the topk. 
+ indices = torch.cat((token_ids, topk_indices), dim=1) + logprobs = torch.cat((token_logprobs, topk_logprobs), dim=1) + + # Use int32 to reduce the tensor size. + indices = indices.to(torch.int32) + + return LogprobsTensors(indices, logprobs, token_ranks) + + +Sampler.gather_logprobs = gather_logprobs diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 4d2498750ee..128ef79a49d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -45,8 +45,9 @@ from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model -from vllm.model_executor.models.interfaces_base import (VllmModelForPooling, - is_pooling_model) +from vllm.model_executor.models.interfaces import supports_transcription +from vllm.model_executor.models.interfaces_base import ( + VllmModelForPooling, is_pooling_model, is_text_generation_model) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -66,7 +67,7 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from vllm.v1.worker.utils import (gather_mm_placeholders, +from vllm.v1.worker.utils import (bind_kv_cache, gather_mm_placeholders, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) @@ -88,15 +89,8 @@ from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch -if vllm_version_is("0.9.2"): - from vllm.model_executor.models.interfaces import has_step_pooler - from vllm.v1.utils import bind_kv_cache -else: - from vllm.model_executor.models.interfaces import supports_transcription - from 
vllm.model_executor.models.interfaces_base import \ - is_text_generation_model +if not vllm_version_is("0.10.0"): from vllm.tasks import GenerationTask, SupportedTask - from vllm.v1.worker.utils import bind_kv_cache if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] @@ -409,7 +403,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: else: generator = None - if not vllm_version_is("0.9.2") and pooling_params: + if pooling_params: assert (task := pooling_params.task) is not None, ( "You did not set `task` in the API") model = cast(VllmModelForPooling, self.model) @@ -585,10 +579,7 @@ def get_eagle_atten_dict( # OPTIMIZATION: Start copying the block table first. # This way, we can overlap the copy with the following CPU operations. - if vllm_version_is("0.9.2"): - self.input_batch.block_table.commit(num_reqs) - else: - self.input_batch.block_table.commit_block_table(num_reqs) + self.input_batch.block_table.commit_block_table(num_reqs) # Get the number of scheduled tokens for each request. req_ids = self.input_batch.req_ids @@ -939,10 +930,7 @@ def _process_reqs( # OPTIMIZATION: Start copying the block table first. # This way, we can overlap the copy with the following CPU operations. - if vllm_version_is("0.9.2"): - self.input_batch.block_table.commit(num_reqs) - else: - self.input_batch.block_table.commit_block_table(num_reqs) + self.input_batch.block_table.commit_block_table(num_reqs) # Get the number of scheduled tokens for each request. # TODO: The Python loop can be slow. Optimize. 
@@ -1771,57 +1759,33 @@ def _dummy_pooler_run( req_num_tokens = num_tokens // num_reqs - if vllm_version_is("0.9.2"): - dummy_metadata = PoolingMetadata( - prompt_lens=torch.tensor( - [h.shape[0] for h in hidden_states_list], - device=self.device), - prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), - dtype=torch.int32, - device=self.device), - pooling_params=[PoolingParams()] * num_reqs) - try: - pooler_output = self.model.pooler( - hidden_states=hidden_states_list, - pooling_metadata=dummy_metadata) - except RuntimeError as e: - if 'out of memory' in str(e): - raise RuntimeError( - "NPU out of memory occurred when warming up pooler with " - f"{num_reqs} dummy requests. Please try lowering " - "`max_num_seqs` or `gpu_memory_utilization` when " - "initializing the engine.") from e - else: - raise e - else: - model = cast(VllmModelForPooling, self.model) - dummy_task = self.get_supported_pooling_tasks()[0] - dummy_pooling_params = PoolingParams(task=dummy_task) - - to_update = model.pooler.get_pooling_updates(dummy_task) - to_update.apply(dummy_pooling_params) - - dummy_metadata = PoolingMetadata( - prompt_lens=torch.tensor( - [h.shape[0] for h in hidden_states_list], - device=self.device), - prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), - dtype=torch.int32, - device=self.device), - pooling_params=[dummy_pooling_params] * num_reqs) - - try: - pooler_output = model.pooler(hidden_states=hidden_states_list, - pooling_metadata=dummy_metadata) - except RuntimeError as e: - if 'out of memory' in str(e): - raise RuntimeError( - "NPU out of memory occurred when warming up pooler with " - f"{num_reqs} dummy requests. 
Please try lowering " - "`max_num_seqs` or `gpu_memory_utilization` when " - "initializing the engine.") from e - else: - raise e + model = cast(VllmModelForPooling, self.model) + dummy_task = self.get_supported_pooling_tasks()[0] + dummy_pooling_params = PoolingParams(task=dummy_task) + + to_update = model.pooler.get_pooling_updates(dummy_task) + to_update.apply(dummy_pooling_params) + + dummy_metadata = PoolingMetadata( + prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list], + device=self.device), + prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), + dtype=torch.int32, + device=self.device), + pooling_params=[dummy_pooling_params] * num_reqs) + + try: + pooler_output = model.pooler(hidden_states=hidden_states_list, + pooling_metadata=dummy_metadata) + except RuntimeError as e: + if 'out of memory' in str(e): + raise RuntimeError( + "NPU out of memory occurred when warming up pooler with " + f"{num_reqs} dummy requests. Please try lowering " + "`max_num_seqs` or `gpu_memory_utilization` when " + "initializing the engine.") from e + else: + raise e return pooler_output @@ -1841,9 +1805,6 @@ def load_model(self) -> None: QKVParallelLinear, RowParallelLinear)): module.weight.data = torch_npu.npu_format_cast( module.weight.data, ACL_FORMAT_FRACTAL_NZ) - - if vllm_version_is("0.9.2") and has_step_pooler(self.model): - self.input_batch.logits_processing_needs_token_ids_bool = True if self.drafter: logger.info("Loading drafter model...") if isinstance(self.drafter, EagleProposer): diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py index 5a695af65d4..d0acd04cd06 100644 --- a/vllm_ascend/worker/npu_input_batch.py +++ b/vllm_ascend/worker/npu_input_batch.py @@ -35,8 +35,6 @@ from vllm.v1.utils import copy_slice from vllm.v1.worker.block_table import MultiGroupBlockTable -from vllm_ascend.utils import vllm_version_is - _SAMPLING_EPS = 1e-5 @@ -246,11 +244,8 @@ def __init__( # req_index -> bad_words_token_ids 
self.bad_words_token_ids: dict[int, list[list[int]]] = {} - if vllm_version_is("0.9.2"): - self.logits_processing_needs_token_ids_bool = False - else: - self.logits_processing_needs_token_ids = np.zeros(max_num_reqs, - dtype=bool) + self.logits_processing_needs_token_ids = np.zeros(max_num_reqs, + dtype=bool) self.req_output_token_ids: list[Optional[list[int]]] = [] @@ -387,9 +382,6 @@ def add_request( if sampling_params.bad_words_token_ids: self.bad_words_token_ids[ req_index] = sampling_params.bad_words_token_ids - elif vllm_version_is("0.9.2"): - assert request.pooling_params is not None - self.pooling_params[req_id] = request.pooling_params elif pooling_params := request.pooling_params: self.pooling_params[req_id] = pooling_params self.logits_processing_needs_token_ids[req_index] = ( @@ -624,15 +616,10 @@ def _make_sampling_metadata(self) -> SamplingMetadata: self.presence_penalties, num_reqs) copy_slice(self.repetition_penalties_cpu_tensor, self.repetition_penalties, num_reqs) - if vllm_version_is("0.9.2"): - needs_prompt_token_ids = ( - not self.no_penalties - or (self.num_reqs > 0 - and self.logits_processing_needs_token_ids_bool)) - else: - needs_prompt_token_ids = ( - not self.no_penalties - or self.logits_processing_needs_token_ids[:num_reqs].any()) + + needs_prompt_token_ids = ( + not self.no_penalties + or self.logits_processing_needs_token_ids[:num_reqs].any()) if needs_prompt_token_ids: # The prompt tokens are used only for applying penalties or # step pooling during the sampling/pooling process. diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index a5e1a1c9081..e3dcd9fe416 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -45,7 +45,7 @@ vllm_version_is) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner -if not vllm_version_is("0.9.2"): +if not vllm_version_is("0.10.0"): from vllm.tasks import SupportedTask