Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/format_pr_body.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:

- name: Get vLLM version
run: |
VLLM_COMMIT=83f478bb19489b41e9d208b47b4bb5a95ac171ac
VLLM_COMMIT=2918c1b49c88c29783c86f78d2c4221cb9622379
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/vllm_ascend_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/pre-commit.yml
with:
vllm: 83f478bb19489b41e9d208b47b4bb5a95ac171ac
vllm: 2918c1b49c88c29783c86f78d2c4221cb9622379
changes:
runs-on: ubuntu-latest
outputs:
Expand Down Expand Up @@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0]
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
steps:
- name: Install packages
run: |
Expand Down Expand Up @@ -138,7 +138,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0]
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [83f478bb19489b41e9d208b47b4bb5a95ac171ac, v0.11.0]
vllm_version: [2918c1b49c88c29783c86f78d2c4221cb9622379, v0.11.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
For the main branch of vLLM Ascend, we usually keep it compatible with the latest vLLM release plus a newer vLLM commit hash. Note that this table is updated frequently, so please check it regularly.
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | v0.11.0/83f478bb19489b41e9d208b47b4bb5a95ac171ac | >= 3.10, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 |
| main | v0.11.0/2918c1b49c88c29783c86f78d2c4221cb9622379 | >= 3.10, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 |

## Release cadence

Expand Down
9 changes: 6 additions & 3 deletions tests/ut/worker/test_worker_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from tests.ut.base import TestBase
from vllm_ascend.utils import vllm_version_is

# Patch target for init_cached_hf_modules: vLLM moved it from vllm.utils to
# vllm.utils.import_utils after the 0.11.0 release, so pick the path that
# matches the installed vLLM version.
if vllm_version_is("0.11.0"):
    init_cached_hf_modules_path = "vllm.utils.init_cached_hf_modules"
else:
    init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"


class TestNPUWorker(TestBase):

Expand Down Expand Up @@ -53,7 +56,7 @@ def setUp(self):
@patch("vllm_ascend.worker.worker_v1.init_ascend_config")
@patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version")
@patch("vllm_ascend.worker.worker_v1.try_register_lib")
@patch("vllm.utils.init_cached_hf_modules")
@patch(init_cached_hf_modules_path)
@patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler")
def test_init_npu_worker_normal_case(
self,
Expand Down Expand Up @@ -115,7 +118,7 @@ def test_init_npu_worker_normal_case(
@patch("vllm_ascend.worker.worker_v1.init_ascend_config")
@patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version")
@patch("vllm_ascend.worker.worker_v1.try_register_lib")
@patch("vllm.utils.init_cached_hf_modules")
@patch(init_cached_hf_modules_path)
@patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler")
def test_init_npu_worker_with_trust_remote_code(
self,
Expand Down Expand Up @@ -160,7 +163,7 @@ def test_init_npu_worker_with_trust_remote_code(
@patch("vllm_ascend.worker.worker_v1.init_ascend_config")
@patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version")
@patch("vllm_ascend.worker.worker_v1.try_register_lib")
@patch("vllm.utils.init_cached_hf_modules")
@patch(init_cached_hf_modules_path)
@patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler")
def test_init_npu_worker_with_custom_cache_dtype(
self,
Expand Down
9 changes: 8 additions & 1 deletion vllm_ascend/attention/attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,14 @@
get_decode_context_model_parallel_rank,
get_decode_context_model_parallel_world_size)
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.utils import cdiv

from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv
Comment thread
wangxiyuan marked this conversation as resolved.

from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import AttentionSpec
Expand Down
9 changes: 8 additions & 1 deletion vllm_ascend/attention/mla_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,14 @@
from vllm.logger import logger
from vllm.model_executor.layers.linear import (LinearBase,
UnquantizedLinearMethod)
from vllm.utils import cdiv, round_down

from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
from vllm.utils import cdiv, round_down
else:
from vllm.utils.math_utils import cdiv, round_down
Comment thread
wangxiyuan marked this conversation as resolved.

from vllm.v1.attention.backends.utils import AttentionCGSupport

from vllm_ascend import envs
Expand Down
9 changes: 8 additions & 1 deletion vllm_ascend/core/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,14 @@
from vllm.distributed.kv_events import KVEventBatch
from vllm.logger import logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.utils import cdiv

from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv
Comment thread
wangxiyuan marked this conversation as resolved.

from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler
Expand Down
10 changes: 9 additions & 1 deletion vllm_ascend/distributed/mooncake/config_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,15 @@
import torch
from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata
from vllm.utils import cdiv, logger
from vllm.utils import logger

from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
from vllm.utils import cdiv
else:
from vllm.utils.math_utils import cdiv
Comment thread
wangxiyuan marked this conversation as resolved.

from vllm.v1.core.sched.output import NewRequestData

DEFAULT_GLOBAL_SEGMENT_SIZE = 3355443200 # 3.125 GiB
Expand Down
15 changes: 13 additions & 2 deletions vllm_ascend/models/qwen2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from vllm.model_executor.models.utils import maybe_prefix
from vllm.multimodal import MULTIMODAL_REGISTRY

from vllm_ascend.ascend_forward_context import set_ascend_forward_context
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, is_enable_nz,
vllm_version_is)

Expand Down Expand Up @@ -536,7 +537,11 @@ def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]:
image_embeds = image_input["image_embeds"].type(self.visual.dtype)
else:
pixel_values = image_input["pixel_values"].type(self.visual.dtype)
image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
if vllm_version_is("0.11.0"):
image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
else:
with set_ascend_forward_context(None, self.vllm_config):
image_embeds = self.visual(pixel_values, grid_thw=grid_thw)

# Split concatenated embeddings for each image item.
merge_size = self.visual.spatial_merge_size
Expand All @@ -553,7 +558,13 @@ def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]:
else:
pixel_values_videos = video_input["pixel_values_videos"].type(
self.visual.dtype)
video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
if vllm_version_is("0.11.0"):
video_embeds = self.visual(pixel_values_videos,
grid_thw=grid_thw)
else:
with set_ascend_forward_context(None, self.vllm_config):
video_embeds = self.visual(pixel_values_videos,
grid_thw=grid_thw)

# Split concatenated embeddings for each video item.
merge_size = self.visual.spatial_merge_size
Expand Down
Loading
Loading