vllm-project · wangxiyuan · Jul 26, 2025 · Jul 25, 2025 · Jul 25, 2025 · Jul 26, 2025
@@ -37,7 +37,7 @@ on:
         # Current supported vLLM versions
         options:
           - main
-          - v0.9.2
+          - v0.10.0
           - v0.9.1
           - v0.7.3
       vllm-ascend-version:
@@ -163,7 +163,7 @@ jobs:
           repository: vllm-project/vllm
           path: ./vllm-empty
           # Please also update this when bump matched version
-          ref: ${{ github.event.inputs.vllm-version || 'v0.9.2' }}
+          ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }}
 
       - name: Install vllm-project/vllm from source
         working-directory: ./vllm-empty

@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.9.2
+          - vllm_branch: v0.10.0
             vllm_ascend_branch: main
             vllm_use_v1: 1
       max-parallel: 1

@@ -81,7 +81,7 @@ jobs:
         VLLM_USE_MODELSCOPE: True
     strategy:
       matrix:
-        vllm_version: [main, v0.9.2]
+        vllm_version: [main, v0.10.0]
     steps:
       - name: Install packages
         run: |
@@ -137,7 +137,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1]
-        vllm_version: [main, v0.9.2]
+        vllm_version: [main, v0.10.0]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -216,7 +216,7 @@ jobs:
       max-parallel: 1
       matrix:
         os: [linux-arm64-npu-4]
-        vllm_version: [main, v0.9.2]
+        vllm_version: [main, v0.10.0]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
     container:

@@ -43,7 +43,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.9.2]
+        vllm_version: [main, v0.10.0]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}
     container:

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.9.2
+ARG VLLM_TAG=v0.10.0
 
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.

@@ -77,7 +77,7 @@
     # CANN image tag
     'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10",
     # vllm version in ci
-    'ci_vllm_version': 'v0.9.2',
+    'ci_vllm_version': 'v0.10.0',
 }
 
 # Add any paths that contain templates here, relative to this directory.

@@ -38,15 +38,15 @@ vllm_ascend
 
 In both **platform** and **worker** folder, there are several patch modules. They are used for patching different version of vLLM.
 
-- `patch_0_9_2`: This module is used for patching vLLM 0.9.2. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_9_2` is used for patching vLLM 0.9.2.
+- `patch_0_10_0`: This module is used for patching vLLM 0.10.0. The version is always the nearest released version of vLLM. Once a new version of vLLM is released, we will drop this patch module and bump to the new version. For example, `patch_0_10_0` is used for patching vLLM 0.10.0.
 - `patch_main`: This module is used for patching the code in vLLM main branch.
-- `patch_common`: This module is used for patching both vLLM 0.9.2 and vLLM main branch.
+- `patch_common`: This module is used for patching both vLLM 0.10.0 and vLLM main branch.
 
 ## How to write a patch
 
 Before writing a patch, following the principle above, we should patch the least code. If it's necessary, we can patch the code in either **platform** and **worker** folder. Here is an example to patch `distributed` module in vLLM.
 
-1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.9.2 and main of vLLM.
+1. Decide which version of vLLM we should patch. For example, suppose that after analysis we want to patch both vLLM 0.10.0 and the vLLM main branch.
 2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`.
 3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`.
 4. Write your patch code in the new file. Here is an example:
@@ -82,4 +82,4 @@ Before writing a patch, following the principle above, we should patch the least
 
 ## Limitation
 1. In V1 Engine, vLLM starts three kinds of process: Main process, EngineCore process and Worker process. Now vLLM Ascend only support patch the code in Main process and Worker process by default. If you want to patch the code runs in EngineCore process, you should patch EngineCore process entirely during setup, the entry code is here `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely.
-2. If you are running an edited vLLM code, the version of the vLLM may be changed automatically. For example, if you runs an edited vLLM based on v0.9.n, the version of vLLM may be change to v0.9.nxxx, in this case, the patch for v0.9.n in vLLM Ascend would not work as expect, because that vLLM Ascend can't distinguish the version of vLLM you're using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, then the patch for v0.9.2 should work.
+2. If you are running edited vLLM code, the version of vLLM may be changed automatically. For example, if you run an edited vLLM based on v0.9.n, the version of vLLM may change to v0.9.nxxx. In this case, the patch for v0.9.n in vLLM Ascend would not work as expected, because vLLM Ascend can't distinguish the version of vLLM you're using. To work around this, set the environment variable `VLLM_VERSION` to the version of vLLM you're using (for example, v0.10.0); the patch for that version should then work.
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,8 @@ requires = [
     "msgpack",
     "quart",
     "numba",
+    # Temporary upper bound; remove once https://github.com/vllm-project/vllm-ascend/issues/2034 is resolved.
+    "transformers<4.54.0",
 ]
 build-backend = "setuptools.build_meta"
 

@@ -13,6 +13,8 @@ setuptools-scm>=8
 torch>=2.5.1
 torchvision<0.21.0
 wheel
+# Temporary upper bound; remove once https://github.com/vllm-project/vllm-ascend/issues/2034 is resolved.
+transformers<4.54.0
 
 # requirements for disaggregated prefill
 msgpack

@@ -127,3 +127,19 @@ def test_models_topk() -> None:
                     enforce_eager=True,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
+
+
+def test_models_prompt_logprobs() -> None:
+    # Smoke test: makes no assertions; it passes if generate_greedy_logprobs completes without raising.
+    example_prompts = [
+        "Hello, my name is",
+    ]
+
+    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
+                    max_model_len=8192,
+                    dtype="float16",
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy_logprobs(example_prompts,
+                                            max_tokens=5,
+                                            num_logprobs=1)  # NOTE(review): no prompt-logprobs argument is passed despite the test name — confirm intent
@@ -3,15 +3,12 @@
 import torch
 
 from tests.ut.base import TestBase
-from vllm_ascend.attention.attention_v1 import \
-    AscendAttentionBackendImpl092  # isort: skip
 from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
                                                 AscendAttentionBackendImpl,
                                                 AscendAttentionMetadataBuilder,
                                                 AscendAttentionState,
                                                 AscendMetadata,
                                                 CommonAttentionState)
-from vllm_ascend.utils import vllm_version_is
 
 
 class TestAscendAttentionBackend(TestBase):
@@ -20,12 +17,8 @@ def test_get_name(self):
         self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")
 
     def test_get_impl_cls(self):
-        if vllm_version_is("0.9.2"):
-            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
-                             AscendAttentionBackendImpl092)
-        else:
-            self.assertEqual(AscendAttentionBackend.get_impl_cls(),
-                             AscendAttentionBackendImpl)
+        self.assertEqual(AscendAttentionBackend.get_impl_cls(),
+                         AscendAttentionBackendImpl)
 
     def test_get_metadata_cls(self):
         self.assertEqual(AscendAttentionBackend.get_metadata_cls(),

@@ -17,7 +17,7 @@
 
 from dataclasses import dataclass
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 
 import torch
 import torch_npu
@@ -31,7 +31,7 @@
 
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
+                               nd_to_nz_2d, nd_to_nz_spec)
 
 
 class AscendAttentionBackend(AttentionBackend):
@@ -43,8 +43,6 @@ def get_name() -> str:
 
     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionBackendImpl092
         return AscendAttentionBackendImpl
 
     @staticmethod
@@ -440,38 +438,6 @@ def forward(
         return output.view(num_tokens, self.hidden_size)
 
 
-class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )
-
-
 def unified_ascend_attention_with_output(
     query: torch.Tensor,
     key: torch.Tensor,

@@ -16,7 +16,7 @@
 #
 
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 
 import numpy as np
 import torch
@@ -29,7 +29,7 @@
 
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, vllm_version_is)
+                               nd_to_nz_2d)
 
 
 class AscendAttentionTorchairBackend(AttentionBackend):
@@ -41,8 +41,6 @@ def get_name() -> str:
 
     @staticmethod
     def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
-        if vllm_version_is("0.9.2"):
-            return AscendAttentionTorchairBackendImpl092
         return AscendAttentionTorchairBackendImpl
 
     @staticmethod
@@ -489,36 +487,3 @@ def forward(
                 "to use ascend scheduler.")
 
         return output.view(num_tokens, self.hidden_size)
-
-
-class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
-                                            ):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super().__init__(
-            num_heads=num_heads,
-            head_size=head_size,
-            scale=scale,
-            num_kv_heads=num_kv_heads,
-            alibi_slopes=alibi_slopes,
-            sliding_window=sliding_window,
-            kv_cache_dtype=kv_cache_dtype,
-            logits_soft_cap=logits_soft_cap,
-            attn_type=attn_type,
-            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            use_irope=use_irope,
-        )