5 changes: 2 additions & 3 deletions .github/workflows/accuracy_test.yaml
@@ -34,8 +34,7 @@ on:
# Current supported vLLM versions
options:
- main
-   - v0.9.0.1
-   - v0.9.0
+   - v0.9.1
- v0.7.3
vllm-ascend-version:
description: 'vllm-ascend version:'
@@ -159,7 +158,7 @@ jobs:
repository: vllm-project/vllm
path: ./vllm-empty
# Please also update this when bump matched version
- ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }}
+ ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}

- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
53 changes: 0 additions & 53 deletions .github/workflows/actionlint.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/nightly_benchmarks.yaml
@@ -50,7 +50,7 @@ jobs:
strategy:
matrix:
include:
-   - vllm_branch: v0.9.0
+   - vllm_branch: v0.9.1
vllm_ascend_branch: main
container:
image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
13 changes: 12 additions & 1 deletion .github/workflows/vllm_ascend_test.yaml
@@ -33,6 +33,9 @@ on:
- '!benchmarks/**'
- 'tools/mypy.sh'
- 'mypy.ini'
+   - '.github/workflows/*.ya?ml'
+   - '.github/workflows/actionlint.*'
+   - '.github/workflows/matchers/actionlint.json'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -87,6 +90,13 @@ jobs:
repository: vllm-project/vllm
path: vllm-empty

+ - name: Actionlint Check
+   env:
+     SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
+   run: |
+     echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+     tools/actionlint.sh -color
+
- name: Install vllm-project/vllm from source
working-directory: vllm-empty
run: |
@@ -105,7 +115,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
- vllm_version: [main, v0.9.0]
+ vllm_version: [main, v0.9.1]
concurrency:
group: >
${{
@@ -192,6 +202,7 @@ jobs:
fi

- name: Run vllm-project/vllm-ascend test on V0 engine
+ if: ${{ github.event_name == 'schedule' }}
env:
VLLM_USE_V1: 0
run: |
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_long_term.yaml
@@ -43,7 +43,7 @@ jobs:
max-parallel: 2
matrix:
os: [linux-arm64-npu-1, linux-arm64-npu-4]
- vllm_version: [main, v0.9.0]
+ vllm_version: [main, v0.9.1]
name: vLLM Ascend long term test
runs-on: ${{ matrix.os }}
container:
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_pd.yaml
@@ -41,7 +41,7 @@ jobs:
if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
strategy:
matrix:
- vllm_verison: [main, v0.9.0]
+ vllm_verison: [main, v0.9.1]
name: vLLM Ascend prefilling decoding disaggregation test
runs-on: linux-arm64-npu-static-8

2 changes: 1 addition & 1 deletion Dockerfile
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.9.0
+ ARG VLLM_TAG=v0.9.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion Dockerfile.openEuler
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.9.0
+ ARG VLLM_TAG=v0.9.1

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
32 changes: 8 additions & 24 deletions tests/singlecard/compile/test_simple.py
@@ -14,8 +14,6 @@
set_current_vllm_config)
from vllm.utils import direct_register_custom_op

- from vllm_ascend.utils import vllm_version_is
-
global_counter = 0

# create a library to hold the custom op
@@ -93,28 +91,14 @@ def test_simple_piecewise_compile():
model = SillyModel(vllm_config=vllm_config, prefix="")

inputs = torch.randn(100).npu()

if vllm_version_is("0.9.0"):
kwargs = {
"num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations":
3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_caputured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}
else:
kwargs = {
"num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations":
3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_captured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}

kwargs = {
"num_graphs_seen": 1, # one graph for the model
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
"num_backend_compilations": 3, # num_piecewise_capturable_graphs_seen
"num_cudagraph_captured":
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
}
with compilation_counter.expect(kwargs):

model(inputs)
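Note: the `compilation_counter.expect(kwargs)` context manager used in this test comes from `vllm.compilation.counter`. As a rough sketch of the snapshot-and-delta check it performs (a simplified toy counter, not the real class, which tracks many more fields):

```python
# Simplified sketch of a delta-checking counter, assuming the same
# snapshot/compare behavior as vllm.compilation.counter; not the real class.
import dataclasses
from contextlib import contextmanager


@dataclasses.dataclass
class ToyCompilationCounter:
    num_graphs_seen: int = 0
    num_cudagraph_captured: int = 0

    @contextmanager
    def expect(self, deltas: dict):
        # Snapshot all counters, run the body, then assert each counter
        # advanced by exactly the expected amount.
        before = dataclasses.asdict(self)
        yield
        after = dataclasses.asdict(self)
        for name, delta in deltas.items():
            assert after[name] - before[name] == delta, name


counter = ToyCompilationCounter()
with counter.expect({"num_graphs_seen": 1}):
    counter.num_graphs_seen += 1  # stands in for a real compilation
```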
61 changes: 18 additions & 43 deletions tests/singlecard/test_scheduler.py
@@ -31,7 +31,6 @@
from vllm.v1.structured_output import StructuredOutputManager

from vllm_ascend.core.scheduler import AscendScheduler
- from vllm_ascend.utils import vllm_version_is

EOS_TOKEN_ID = 50256

@@ -87,27 +86,15 @@ def create_scheduler(
vllm_config = VllmConfig(scheduler_config=scheduler_config,
model_config=model_config,
cache_config=cache_config)

if vllm_version_is("0.9.0"):
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
tensors={},
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32,
False))
],
)
else:
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32,
False, None))
],
)
kv_cache_config = KVCacheConfig(
num_blocks=10000, # A large number of blocks to hold all requests
kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])],
kv_cache_groups=[
KVCacheGroupSpec(['layer'],
FullAttentionSpec(16, 1, 1, torch.float32, False,
None))
],
)
cache_config.num_gpu_blocks = 10000
return AscendScheduler(
vllm_config,
@@ -135,27 +122,15 @@ def create_requests(num_requests: int,
else:
mm_position = None
mm_inputs = None
if vllm_version_is("0.9.0"):
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_inputs=mm_inputs,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
arrival_time=0,
eos_token_id=EOS_TOKEN_ID,
)
else:
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_inputs=mm_inputs,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
)
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_inputs=mm_inputs,
multi_modal_placeholders=mm_position,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
)
requests.append(request)
return requests

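Note: for orientation, a hypothetical test body using the two helpers above might look like the following; any `create_requests` keyword beyond `num_requests` is an assumption, not taken from this diff.

```python
# Hypothetical usage of create_scheduler()/create_requests(); parameter
# names other than num_requests are assumed, not shown in this diff.
scheduler = create_scheduler()
for request in create_requests(num_requests=2):
    scheduler.add_request(request)
output = scheduler.schedule()
# Both fresh requests should be scheduled on an empty scheduler.
assert len(output.scheduled_new_reqs) == 2
```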
8 changes: 1 addition & 7 deletions vllm_ascend/compilation/piecewise_backend.py
@@ -31,8 +31,6 @@
from vllm.logger import logger
from vllm.utils import weak_ref_tensors

- from vllm_ascend.utils import vllm_version_is
-

@dataclasses.dataclass
class ConcreteSizeEntry:
@@ -206,11 +204,7 @@ def __call__(self, *args) -> Any:
# to save memory
entry.output = weak_ref_tensors(output)
entry.aclgraph = aclgraph

if vllm_version_is("0.9.0"):
compilation_counter.num_cudagraph_caputured += 1
else:
compilation_counter.num_cudagraph_captured += 1
compilation_counter.num_cudagraph_captured += 1

# important: we need to return the output, rather than
# the weak ref of the output, so that pytorch can correctly
19 changes: 3 additions & 16 deletions vllm_ascend/core/scheduler.py
@@ -29,8 +29,6 @@
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager

- from vllm_ascend.utils import vllm_version_is
-

class AscendScheduler(Scheduler):
"""This Scheduler extends vllm's original v1 scheduler
@@ -129,12 +127,7 @@ def skip_cur_request():
continue

assert num_new_tokens > 0

if vllm_version_is("0.9.0"):
blocks = computed_blocks.blocks
else:
blocks = computed_blocks.blocks[0]

blocks = computed_blocks.blocks[0]
watermark = getattr(self.scheduler_config, "watermark", 0.01)
if not self._check_watermark_for_prefill(request, num_new_tokens,
blocks, watermark):
@@ -330,14 +323,8 @@ def _check_watermark_for_prefill(self,
len(computed_blocks) * self.block_size)
num_required_blocks = cdiv(num_new_tokens + num_computed_tokens,
self.block_size)

if vllm_version_is("0.9.0"):
req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[
request.request_id]
else:
req_blocks = self.kv_cache_manager.coordinator.get_blocks(
request.request_id)

req_blocks = self.kv_cache_manager.coordinator.get_blocks(
request.request_id)
num_new_blocks = (num_required_blocks - len(req_blocks) -
len(computed_blocks))
num_evictable_computed_blocks = sum(1 for blk in computed_blocks
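Note: the arithmetic behind `_check_watermark_for_prefill` is plain ceiling-division block accounting. A self-contained illustration with made-up numbers (the block size and token counts are hypothetical; `cdiv` mirrors `vllm.utils.cdiv`):

```python
# Toy illustration of the prefill block accounting above; all numbers
# are hypothetical and only the formula mirrors the scheduler code.
def cdiv(a: int, b: int) -> int:
    # Ceiling division, as in vllm.utils.cdiv.
    return -(a // -b)

block_size = 16
num_computed_tokens = 48   # tokens already covered by cached blocks
num_new_tokens = 40        # tokens this prefill wants to schedule
num_req_blocks = 3         # blocks currently held by the request
num_computed_blocks = 3    # prefix-cache hits

num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, block_size)
num_new_blocks = num_required_blocks - num_req_blocks - num_computed_blocks
print(num_required_blocks, num_new_blocks)  # 6, 0 -> no new allocation needed
```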
15 changes: 2 additions & 13 deletions vllm_ascend/patch/__init__.py
@@ -24,9 +24,9 @@
# each worker's `__init__` function.
#
# Then in each kind of patch, there are three folders:
- # - patch_0_9_0: contains the patches applied when vllm version is 0.9.0.
+ # - patch_0_9_1: contains the patches applied when vllm version is 0.9.1.
# - patch_main: contains the patches applied when vllm version is main branch.
- # - patch_common: contains the patches applied in both 0.9.0 and main branch.
+ # - patch_common: contains the patches applied in both 0.9.1 and main branch.
#
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
# ----------------------------------------------------------------------------------
@@ -35,17 +35,6 @@
# --------------------------------
# * Platform Patch:
# =================
- # ** File: platform/patch_0_9_0/patch_distributed.py**
- # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # 1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()`
- #    Why:
- #       vllm distributed use gloo backend by default to initialize stateless process group, but we want to use hccl here
- #    How:
- #       Add hccl backend to the `stateless_init_torch_distributed_process_group`
- #    Related PR (if no, explain why):
- #       https://github.com/vllm-project/vllm/pull/18763
- #    Future Plan:
- #       Remove this patch once vllm is upgraded to 0.9.1
# ** File: platform/patch_common/patch_distributed.py**
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. `vllm.distributed.parallel_state.destroy_model_parallel()`
4 changes: 2 additions & 2 deletions vllm_ascend/patch/platform/__init__.py
@@ -17,8 +17,8 @@
from vllm_ascend.utils import vllm_version_is

# Import specific patches for different versions
if vllm_version_is("0.9.0"):
from vllm_ascend.patch.platform import patch_0_9_0 # noqa: F401
if vllm_version_is("0.9.1"):
from vllm_ascend.patch.platform import patch_0_9_1 # noqa: F401
from vllm_ascend.patch.platform import patch_common # noqa: F401
else:
from vllm_ascend.patch.platform import patch_common # noqa: F401
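Note: the `vllm_version_is` helper gating these imports lives in `vllm_ascend/utils.py`. As a rough sketch of the idea (an approximation for illustration, not the actual implementation):

```python
# Approximate sketch of a version gate; the real helper is
# vllm_ascend.utils.vllm_version_is and may differ in detail.
import vllm


def vllm_version_is(target: str) -> bool:
    # True when the installed vLLM release string matches exactly.
    return vllm.__version__ == target


# Usage mirrors the diff above: pick the per-version patch set at import time.
if vllm_version_is("0.9.1"):
    print("would import patch_0_9_1 + patch_common")
else:
    print("would import patch_common only")
```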