2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
@@ -18,7 +18,7 @@ on:
       continue_on_error:
         required: false
         type: boolean
-        default: false
+        default: true
 env:
   UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
   UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
+          VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
+ARG VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 35141a7eeda941a60ad5a4956670c60fd5a77029
+      vllm: 14acf429ac08b6d538ca6feb3e06b6d13895804d
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029, v0.18.0]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d, v0.18.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
+        vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
2 changes: 1 addition & 1 deletion vllm_ascend/ascend_config.py
@@ -129,7 +129,7 @@ def __init__(self, vllm_config: "VllmConfig"):
         # when enable_async_exponential is True, AscendSampler will be different from vllm Sampler,
         # which make batch_invariant mode not working.
         # so we disable async exponential when batch_invariant mode is enabled.
-        from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+        from vllm_ascend.batch_invariant import vllm_is_batch_invariant

         self.enable_async_exponential = (
             bool(additional_config.get("enable_async_exponential", False)) and not vllm_is_batch_invariant()
30 changes: 27 additions & 3 deletions vllm_ascend/attention/attention_v1.py
@@ -688,7 +688,20 @@ def full_graph_pa(
             graph_params.handles[num_tokens].append(handle)
         return output

-    def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata):
+    def _get_fia_params(self, key: torch.Tensor, value: torch.Tensor, attn_metadata: AscendMetadata, kv_cache=None):
+        # PrefillNoCache doesn't need key_cache, but other modes do.
+        # Only initialize/require the cache for modes that actually use it.
+        if attn_metadata.attn_state != AscendAttentionState.PrefillNoCache:
+            # Initialize the cache from kv_cache if not already set (for DecodeOnly mode).
+            if self.key_cache is None and kv_cache is not None:
+                if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
+                    self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+                elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
+                    self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+
+            if self.key_cache is None:
+                raise RuntimeError(f"key_cache is None in _get_fia_params for mode {attn_metadata.attn_state}. kv_cache={kv_cache}")
+
         if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
             block_size = 128
             block_table = None
@@ -766,6 +779,7 @@ def forward_fused_infer_attention(
         value: torch.Tensor,
         attn_metadata: AscendMetadata,
         output: torch.Tensor,
+        kv_cache=None,
     ):
         # we inherit ForwardContext in model runner v2, when enable model
         # runner v2, there is not capturing attribute in forward_context,
@@ -781,7 +795,7 @@ def forward_fused_infer_attention(
             and self.sinks is None
         ):
             return self._forward_fia_slidingwindow(query, attn_metadata, output)
-        key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata)
+        key, value, block_size, block_table, actual_seq_lengths_kv = self._get_fia_params(key, value, attn_metadata, kv_cache)
         num_tokens = attn_metadata.actual_seq_lengths_q[-1]
         query = query[:num_tokens]
         if (
@@ -927,7 +941,7 @@ def forward_impl(
         ):
             output = self.forward_paged_attention(query, attn_metadata, output)
         else:
-            output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output)
+            output = self.forward_fused_infer_attention(query, key, value, attn_metadata, output, kv_cache)

         return output

@@ -963,6 +977,16 @@ def forward(
         num_tokens = query.shape[0]
         if attn_metadata is None:
             return output.fill_(0)
+
+        # Initialize key_cache and value_cache from kv_cache if not already set.
+        # This is needed for DecodeOnly mode where key/value are None but we still
+        # need access to the cache for attention computation.
+        if self.key_cache is None and kv_cache is not None:
+            if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
+                self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+            elif isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
+                self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
+
         output_padded = None
         if key is not None and value is not None:
             output_padded = output
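
The kv_cache unpacking added here now appears verbatim in both _get_fia_params and forward. A small helper inside attention_v1.py (which already imports torch) could remove the duplication; a minimal sketch under the PR's own assumptions about kv_cache layout (the name _split_kv_cache is hypothetical, not part of this change):

def _split_kv_cache(kv_cache):
    # Assumption carried over from the PR: kv_cache is either a stacked
    # tensor of shape [2, ...] or a (key_cache, value_cache) pair;
    # anything else yields (None, None).
    if isinstance(kv_cache, torch.Tensor) and kv_cache.dim() > 0 and kv_cache.shape[0] == 2:
        return kv_cache[0], kv_cache[1]
    if isinstance(kv_cache, (list, tuple)) and len(kv_cache) >= 2:
        return kv_cache[0], kv_cache[1]
    return None, None

Both call sites would then reduce to self.key_cache, self.value_cache = _split_kv_cache(kv_cache) under the existing self.key_cache is None guard.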
16 changes: 15 additions & 1 deletion vllm_ascend/batch_invariant.py
@@ -20,14 +20,28 @@

+import os  # needed for the os.getenv fallback below (add only if not already imported above)
 import torch
 import torch_npu
+import vllm.envs as envs
 from vllm.logger import logger
-from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
 from vllm.triton_utils import HAS_TRITON

 # in case recursive call in reduce_sum.
 torch_sum = torch.sum


+def vllm_is_batch_invariant() -> bool:
+    """Check if batch-invariant mode is enabled.
+
+    This is a compatibility wrapper for the vLLM function that was removed
+    in recent upstream vLLM refactoring.
+    """
+    # Try to read the flag from vLLM's envs module; fall back to the raw
+    # environment variable.
+    if hasattr(envs, 'VLLM_BATCH_INVARIANT'):
+        return bool(envs.VLLM_BATCH_INVARIANT)
+    else:
+        # Fallback for older vLLM versions that don't define this env entry.
+        return bool(int(os.getenv("VLLM_BATCH_INVARIANT", "0")))
+
+
 if HAS_TRITON:
     from vllm_ascend.ops.triton.batch_invariant.matmul import (
         addmm_batch_invariant,
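
A quick way to exercise the new wrapper (a sketch; it assumes a fresh process so vLLM's lazy envs lookup re-reads the variable, and it relies on the os import noted above):

import os

os.environ["VLLM_BATCH_INVARIANT"] = "1"

from vllm_ascend.batch_invariant import vllm_is_batch_invariant

# Expected to report True via either branch: vllm.envs when it still
# defines VLLM_BATCH_INVARIANT, otherwise the raw os.getenv fallback.
print(vllm_is_batch_invariant())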
18 changes: 14 additions & 4 deletions vllm_ascend/kv_offload/npu.py
@@ -5,12 +5,21 @@
 from vllm.v1.attention.backend import AttentionBackend  # type: ignore
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
-from vllm.v1.kv_offload.backends.cpu import CPUBackend
-from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.spec import OffloadingSpec
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler

+# Handle import compatibility with different vLLM versions.
+try:
+    from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
+except ModuleNotFoundError:
+    # Fallback for older vLLM versions where the path might be different.
+    try:
+        from vllm.v1.kv_offload.cpu_manager import CPUOffloadingManager  # noqa: F401
+    except ModuleNotFoundError:
+        # If still not found, set it to None; the failure then surfaces at
+        # usage time in get_manager rather than at import time.
+        CPUOffloadingManager = None  # type: ignore
+
 from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler


@@ -36,8 +45,9 @@ def get_manager(self) -> OffloadingManager:
         assert len(self.gpu_block_size) == 1
         gpu_block_size = self.gpu_block_size[0]
         offloaded_block_size = gpu_block_size * self.block_size_factor
-        self._manager = LRUOffloadingManager(
-            CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
+        self._manager = CPUOffloadingManager(
+            block_size=offloaded_block_size,
+            num_blocks=self.num_cpu_blocks,
             enable_events=enable_events,
         )
         return self._manager
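
Since the double fallback can leave CPUOffloadingManager as None, calling it in get_manager would otherwise fail with an opaque TypeError. A guard along these lines (a sketch, not part of the PR) would surface a clearer error:

def get_manager(self) -> OffloadingManager:
    if CPUOffloadingManager is None:
        # Neither import path matched the installed vLLM build.
        raise ImportError(
            "CPUOffloadingManager is unavailable in the installed vLLM; "
            "CPU KV offloading requires a vLLM version that provides it."
        )
    # ... existing body unchanged ...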
5 changes: 4 additions & 1 deletion vllm_ascend/ops/mla.py
@@ -183,7 +183,10 @@ def mla_forward(
             attn_metadata = forward_context.attn_metadata[self.mla_attn.layer_name]
         else:
             attn_metadata = forward_context.attn_metadata
-        kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
+        if vllm_version_is("0.18.0"):
+            kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine]
+        else:
+            kv_cache = self.mla_attn.kv_cache
         self.mla_attn.impl.forward(
             self.mla_attn.layer_name, hidden_states, kv_cache, attn_metadata, need_gather_q_kv, output
         )
5 changes: 4 additions & 1 deletion vllm_ascend/patch/worker/patch_qwen3_5.py
@@ -135,7 +135,10 @@ def _forward_core(
         non_spec_token_indx = attn_metadata.non_spec_token_indx
         spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor  # noqa: E501
         non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
-        self_kv_cache = self.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
+        if vllm_version_is("0.18.0"):
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+        else:
+            self_kv_cache = self.kv_cache
         conv_state = self_kv_cache[0].transpose(-1, -2)
         ssm_state = self_kv_cache[1]
         num_actual_tokens = attn_metadata.num_actual_tokens
5 changes: 4 additions & 1 deletion vllm_ascend/patch/worker/patch_qwen3_next.py
@@ -125,7 +125,10 @@ def _forward_core(
         non_spec_token_indx = attn_metadata.non_spec_token_indx
         spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor  # noqa: E501
         non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
-        self_kv_cache = self.kv_cache[forward_context.virtual_engine if vllm_version_is("0.18.0") else 0]
+        if vllm_version_is("0.18.0"):
+            self_kv_cache = self.kv_cache[forward_context.virtual_engine]
+        else:
+            self_kv_cache = self.kv_cache
         conv_state = self_kv_cache[0].transpose(-1, -2)
         ssm_state = self_kv_cache[1]
         num_actual_tokens = attn_metadata.num_actual_tokens
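
This is the third copy of the same version gate (after mla.py and patch_qwen3_5.py): vLLM 0.18.0 still wraps each layer's kv_cache in a per-virtual-engine list, while newer vLLM binds the cache directly (see the bind_kv_cache change below). A shared helper could collapse the three sites; a sketch, with the hypothetical name get_layer_kv_cache and assuming vllm_version_is lives in vllm_ascend.utils:

from vllm_ascend.utils import vllm_version_is  # assumed import location


def get_layer_kv_cache(module, forward_context):
    # vLLM 0.18.0: kv_cache is a list indexed by virtual engine.
    if vllm_version_is("0.18.0"):
        return module.kv_cache[forward_context.virtual_engine]
    # Newer vLLM: the cache tensor is bound directly on the module.
    return module.kv_cache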
3 changes: 1 addition & 2 deletions vllm_ascend/patch/worker/patch_qwen3_next_mtp.py
@@ -44,8 +44,7 @@ def bind_kv_cache(

     # Bind kv_caches to forward context
     for layer_name, kv_cache in kv_caches.items():
-        # NOTE: Use list because of v0 PP virtual engine.
-        forward_context[layer_name].kv_cache = [kv_cache]
+        forward_context[layer_name].kv_cache = kv_cache


 utils.bind_kv_cache = bind_kv_cache
2 changes: 1 addition & 1 deletion vllm_ascend/sample/sampler.py
@@ -1,5 +1,5 @@
 import torch
-from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+from vllm_ascend.batch_invariant import vllm_is_batch_invariant
 from vllm.triton_utils import HAS_TRITON
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
3 changes: 3 additions & 0 deletions vllm_ascend/spec_decode/eagle_proposer.py
@@ -91,6 +91,9 @@ class SpecDecodeBaseProposer(EagleProposer):
     def __init__(self, vllm_config: VllmConfig, device: torch.device, pass_hidden_states_to_model: bool, runner=None):
         super().__init__(vllm_config, device, runner)

+        # Assign runner before it's used in the methods below.
+        self.runner = runner
+
         self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling
         self.pass_hidden_states_to_model = pass_hidden_states_to_model
         self.decode_threshold = 1 + self.num_speculative_tokens
2 changes: 1 addition & 1 deletion vllm_ascend/utils.py
@@ -259,7 +259,7 @@ def enable_custom_op():
     Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component.
     Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
     """
-    from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
+    from vllm_ascend.batch_invariant import vllm_is_batch_invariant

     global _CUSTOM_OP_ENABLED