Merged
Changes from all commits · 21 commits
4951334
upgrade to 0324
22dimensions Mar 25, 2026
ec9eb31
fix: add vllm_is_batch_invariant compatibility wrapper
claude Mar 25, 2026
536f282
fix issue caused by [Async][Spec Decoding]
22dimensions Mar 27, 2026
aa154f0
Fix kv_cache and batch inference issues and Fix MLA kv_cache IndexErr…
leo-pony Mar 31, 2026
adf6408
fix rebase; also use batch_invariant from vllm
leo-pony Mar 31, 2026
0643b25
ci format fix
leo-pony Mar 31, 2026
37763f3
Fix pooling/embedding model hang by computing optimistic_seq_lens…
leo-pony Mar 31, 2026
f1a8de1
Fix arange_np corruption in _dummy_run causing IndexError on second i…
leo-pony Mar 31, 2026
18271d6
set continue on error
leo-pony Mar 31, 2026
7c656de
Fix eagle_proposer accessing .gpu/.cpu on plain tensor attributes
leo-pony Apr 2, 2026
56e0394
Disabled async_scheduling when speculative decoding is active on Asce…
leo-pony Apr 2, 2026
dce84b9
ci format fix
leo-pony Apr 2, 2026
38836f7
seq_lens and positions are always plain GPU tensors; all hasattr(self.seq_lens, '…
leo-pony Apr 2, 2026
d9d2209
1. seq_lens and positions: plain GPU tensors, remove all guards
leo-pony Apr 2, 2026
06dbb57
Cancel unnecessary changes in model_runner_v1.py
Apr 2, 2026
bdee2e2
Fix model_runner_v1.py and cancel changes in attention_v1.py
wangbj127 Apr 2, 2026
5719e82
Optimize away the unneeded dtype conversion
leo-pony Apr 2, 2026
7fef40e
Replace vllm_is_batch_invariant with envs, to stay consistent with upstr…
leo-pony Apr 3, 2026
43ee1a3
fix 310P _prepare_input_ids missing num_reqs param, breaking import in vllm…
leo-pony Apr 3, 2026
2c6d795
Skip the 310P qwen3.5 test case and make test case more stable
leo-pony Apr 3, 2026
e56fd9c
Optimize performance: remove seq_lens device-to-host copy
leo-pony Apr 3, 2026
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

- name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=35141a7eeda941a60ad5a4956670c60fd5a77029
ARG VLLM_COMMIT=14acf429ac08b6d538ca6feb3e06b6d13895804d
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 35141a7eeda941a60ad5a4956670c60fd5a77029
vllm: 14acf429ac08b6d538ca6feb3e06b6d13895804d
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
@@ -90,7 +90,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [35141a7eeda941a60ad5a4956670c60fd5a77029]
vllm_version: [14acf429ac08b6d538ca6feb3e06b6d13895804d]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
9 changes: 8 additions & 1 deletion tests/e2e/310p/multicard/test_moe_model_multicard.py
@@ -15,7 +15,9 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.

from tests.e2e.conftest import VllmRunner
import pytest

from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free


def test_qwen3_moe_tp4_fp16():
@@ -33,6 +35,7 @@ def test_qwen3_moe_tp4_fp16():
vllm_model.generate_greedy(example_prompts, max_tokens)


@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_qwen3_moe_ep4_fp16():
example_prompts = [
"Hello, my name is",
@@ -65,6 +68,10 @@ def test_qwen3_moe_tp2_w8a8():
vllm_model.generate_greedy(example_prompts, max_tokens)


@pytest.mark.skip(
reason="Upstream changes caused the 310P Qwen 3.5 patch to become"
" invalid; YangShuai52 is currently working on the fix"
)
def test_qwen3_5_moe_tp4_fp16():
example_prompts = [
"Hello, my name is",
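
The wait_until_npu_memory_free decorator pulled in from tests.e2e.conftest gates memory-hungry tests until the NPU has drained the previous engine's allocations. Its conftest implementation is not part of this diff; the sketch below is a plausible reading of it, assuming only the decorator name and the target_free_percentage argument from the usage above, and assuming torch.npu.mem_get_info mirrors torch.cuda.mem_get_info.

import functools
import time

import torch


def wait_until_npu_memory_free(target_free_percentage: float, timeout_s: float = 300.0, poll_s: float = 5.0):
    """Delay the wrapped test until the NPU reports enough free memory (illustrative sketch)."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            deadline = time.monotonic() + timeout_s
            while time.monotonic() < deadline:
                # Assumed NPU analogue of torch.cuda.mem_get_info(); returns (free, total) in bytes.
                free, total = torch.npu.mem_get_info()
                if free / total >= target_free_percentage:
                    break
                # The previous test's engine may still be tearing down and releasing memory.
                time.sleep(poll_s)
            return fn(*args, **kwargs)

        return wrapper

    return decorator
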
4 changes: 4 additions & 0 deletions tests/e2e/310p/multicard/test_vl_model_multicard.py
@@ -18,6 +18,8 @@
import os
import sys

from tests.e2e.conftest import wait_until_npu_memory_free

# Add 310p directory to sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir) # 310p directory
@@ -27,11 +29,13 @@
from test_utils import run_vl_model_test


@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_qwen3_vl_8b_tp2_fp16():
"""Qwen3-VL-8B dual-card FP16 test"""
run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=2, max_tokens=5)


@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_qwen3_vl_32b_tp1_fp16():
"""Qwen3-VL-32B 4-card FP16 test"""
run_vl_model_test(model_name="Qwen/Qwen3-VL-32B-Instruct", tensor_parallel_size=4, max_tokens=5)
6 changes: 6 additions & 0 deletions tests/e2e/310p/singlecard/test_dense_model_singlecard.py
@@ -15,6 +15,8 @@
# limitations under the License.
# This file is a part of the vllm-ascend project.

import pytest

from tests.e2e.conftest import VllmRunner


@@ -49,6 +51,10 @@ def test_qwen3_dense_tp1_w8a8():
vllm_model.generate_greedy(example_prompts, max_tokens)


@pytest.mark.skip(
reason="Upstream changes caused the 310P Qwen 3.5 patch to become"
" invalid; YangShuai52 is currently working on the fix"
)
def test_qwen3_5_dense_tp1_fp16():
example_prompts = [
"Hello, my name is",
3 changes: 3 additions & 0 deletions tests/e2e/310p/singlecard/test_vl_model_singlecard.py
@@ -18,6 +18,8 @@
import os
import sys

from tests.e2e.conftest import wait_until_npu_memory_free

# Add 310p directory to sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir) # 310p directory
@@ -27,6 +29,7 @@
from test_utils import run_vl_model_test


@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_qwen3_vl_8b_tp1_fp16():
"""Qwen3-VL-8B single-card FP16 test"""
run_vl_model_test(model_name="Qwen/Qwen3-VL-8B-Instruct", tensor_parallel_size=1, max_tokens=5)
4 changes: 2 additions & 2 deletions tests/e2e/singlecard/test_aclgraph_batch_invariant.py
@@ -221,9 +221,9 @@ def test_aclgraph_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pyte

# For batch invariance, disable custom all-reduce to ensure deterministic
# all-reduce operations (custom all-reduce may not be deterministic)
from vllm_ascend.batch_invariant import vllm_is_batch_invariant
import vllm.envs as envs

disable_custom_ar = vllm_is_batch_invariant()
disable_custom_ar = envs.VLLM_BATCH_INVARIANT

if disable_custom_ar:
print(f"\n{'=' * 80}")
4 changes: 2 additions & 2 deletions tests/e2e/singlecard/test_batch_invariant.py
@@ -217,9 +217,9 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(monkeypatch: pytest.Monkey

# For batch invariance, disable custom all-reduce to ensure deterministic
# all-reduce operations (custom all-reduce may not be deterministic)
from vllm_ascend.batch_invariant import vllm_is_batch_invariant
import vllm.envs as envs

disable_custom_ar = vllm_is_batch_invariant()
disable_custom_ar = envs.VLLM_BATCH_INVARIANT

if disable_custom_ar:
print(f"\n{'=' * 80}")
3 changes: 2 additions & 1 deletion tests/ut/batch_invariant/test_batch_invariant.py
@@ -126,7 +126,8 @@ def test_enable_batch_invariant_mode_no_backend(self):
def test_init_batch_invariance(self, batch_invariant_enabled, has_backend, expected_logger_call):
"""Test init_batch_invariance under different conditions"""
# Mock dependencies
batch_invariant.vllm_is_batch_invariant = MagicMock(return_value=batch_invariant_enabled)
import vllm.envs as envs
envs.VLLM_BATCH_INVARIANT = batch_invariant_enabled
batch_invariant.HAS_TRITON = has_backend
batch_invariant.HAS_ASCENDC_BATCH_INVARIANT = has_backend
batch_invariant.override_envs_for_invariance = MagicMock()
1 change: 1 addition & 0 deletions vllm_ascend/_310p/model_runner_310p.py
@@ -318,6 +318,7 @@ def _allocate_kv_cache_tensors(self, kv_cache_config: KVCacheConfig) -> dict[str
def _prepare_input_ids(
self,
scheduler_output: SchedulerOutput,
num_reqs: int,
total_num_scheduled_tokens: int,
cu_num_tokens: np.ndarray,
) -> None:
4 changes: 2 additions & 2 deletions vllm_ascend/ascend_config.py
@@ -129,10 +129,10 @@ def __init__(self, vllm_config: "VllmConfig"):
# when enable_async_exponential is True, AscendSampler will be different from vllm Sampler,
# which make batch_invariant mode not working.
# so we disable async exponential when batch_invariant mode is enabled.
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
import vllm.envs as envs

self.enable_async_exponential = (
bool(additional_config.get("enable_async_exponential", False)) and not vllm_is_batch_invariant()
bool(additional_config.get("enable_async_exponential", False)) and not envs.VLLM_BATCH_INVARIANT
)

use_sparse = hasattr(vllm_config.model_config, "hf_text_config") and hasattr(
4 changes: 2 additions & 2 deletions vllm_ascend/batch_invariant.py
@@ -20,8 +20,8 @@

import torch
import torch_npu
import vllm.envs as envs
from vllm.logger import logger
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.triton_utils import HAS_TRITON

# in case recursive call in reduce_sum.
@@ -136,7 +136,7 @@ def init_batch_invariance():
Call this function early in your application, or set VLLM_BATCH_INVARIANT=1
environment variable to enable automatically.
"""
if vllm_is_batch_invariant():
if envs.VLLM_BATCH_INVARIANT:
if HAS_TRITON or HAS_ASCENDC_BATCH_INVARIANT:
logger.info(
"Enabling batch-invariant mode for vLLM on Ascend NPU.",
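
Taken together with the test and config changes above, the local vllm_is_batch_invariant() helper is gone and every call site now reads vLLM's central envs.VLLM_BATCH_INVARIANT flag. A minimal sketch of enabling the mode end to end, assuming the usual vllm.envs behaviour of resolving the environment variable when the attribute is read:

import os

# The init_batch_invariance docstring above documents VLLM_BATCH_INVARIANT=1
# as the switch; set it before vLLM resolves the flag.
os.environ["VLLM_BATCH_INVARIANT"] = "1"

import vllm.envs as envs
from vllm_ascend.batch_invariant import init_batch_invariance

if envs.VLLM_BATCH_INVARIANT:
    # Installs the deterministic kernels when Triton or the AscendC
    # batch-invariant backend is available, as in the patched function above.
    init_batch_invariance()
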
8 changes: 4 additions & 4 deletions vllm_ascend/kv_offload/npu.py
@@ -5,8 +5,7 @@
from vllm.v1.attention.backend import AttentionBackend # type: ignore
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
from vllm.v1.kv_offload.backends.cpu import CPUBackend
from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.spec import OffloadingSpec
from vllm.v1.kv_offload.worker.worker import OffloadingHandler
@@ -36,8 +35,9 @@ def get_manager(self) -> OffloadingManager:
assert len(self.gpu_block_size) == 1
gpu_block_size = self.gpu_block_size[0]
offloaded_block_size = gpu_block_size * self.block_size_factor
self._manager = LRUOffloadingManager(
CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks),
self._manager = CPUOffloadingManager(
block_size=offloaded_block_size,
num_blocks=self.num_cpu_blocks,
enable_events=enable_events,
)
return self._manager
2 changes: 1 addition & 1 deletion vllm_ascend/ops/mla.py
@@ -183,7 +183,7 @@ def mla_forward(
attn_metadata = forward_context.attn_metadata[self.mla_attn.layer_name]
else:
attn_metadata = forward_context.attn_metadata
kv_cache = self.mla_attn.kv_cache[0]
kv_cache = self.mla_attn.kv_cache
self.mla_attn.impl.forward(
self.mla_attn.layer_name, hidden_states, kv_cache, attn_metadata, need_gather_q_kv, output
)
2 changes: 1 addition & 1 deletion vllm_ascend/patch/worker/patch_qwen3_5.py
@@ -135,7 +135,7 @@ def _forward_core(
non_spec_token_indx = attn_metadata.non_spec_token_indx
spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501
non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501
self_kv_cache = self.kv_cache[0]
self_kv_cache = self.kv_cache
conv_state = self_kv_cache[0].transpose(-1, -2)
ssm_state = self_kv_cache[1]
num_actual_tokens = attn_metadata.num_actual_tokens
2 changes: 1 addition & 1 deletion vllm_ascend/patch/worker/patch_qwen3_next.py
@@ -125,7 +125,7 @@ def _forward_core(
non_spec_token_indx = attn_metadata.non_spec_token_indx
spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor # noqa: E501
non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501
self_kv_cache = self.kv_cache[0]
self_kv_cache = self.kv_cache
conv_state = self_kv_cache[0].transpose(-1, -2)
ssm_state = self_kv_cache[1]
num_actual_tokens = attn_metadata.num_actual_tokens
3 changes: 1 addition & 2 deletions vllm_ascend/patch/worker/patch_qwen3_next_mtp.py
@@ -44,8 +44,7 @@ def bind_kv_cache(

# Bind kv_caches to forward context
for layer_name, kv_cache in kv_caches.items():
# NOTE: Use list because of v0 PP virtual engine.
forward_context[layer_name].kv_cache = [kv_cache]
forward_context[layer_name].kv_cache = kv_cache


utils.bind_kv_cache = bind_kv_cache
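
The four kv_cache edits above (ops/mla.py and the qwen3_5 / qwen3_next / qwen3_next_mtp patches) are one pattern: the per-layer cache is no longer wrapped in a single-element list left over from the v0 pipeline-parallel virtual engines, so the binder stores the tensor directly and consumers drop the [0] index. A condensed, self-contained sketch of that pattern; the SimpleNamespace forward context below is a toy stand-in, not vLLM's real object:

from types import SimpleNamespace

import torch


def bind_kv_cache(kv_caches: dict, forward_context: dict) -> None:
    for layer_name, kv_cache in kv_caches.items():
        # Before this PR the cache was wrapped as [kv_cache] and consumers
        # had to read kv_cache[0]; now the tensor is bound directly.
        forward_context[layer_name].kv_cache = kv_cache


ctx = {"model.layers.0.attn": SimpleNamespace()}
bind_kv_cache({"model.layers.0.attn": torch.zeros(2, 4, 8)}, ctx)
kv_cache = ctx["model.layers.0.attn"].kv_cache  # direct access, no [0] indexing
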
13 changes: 13 additions & 0 deletions vllm_ascend/platform.py
@@ -789,6 +789,19 @@ def _fix_incompatible_config(vllm_config: VllmConfig) -> None:
)
vllm_config.scheduler_config.max_num_partial_prefills = 1

# Disable async scheduling when speculative decoding is active.
# Ascend does not implement the GPU-side num_computed_tokens
# correction (update_num_computed_tokens_for_batch_change) required
# for async spec decode, which causes accuracy divergence.
if vllm_config.speculative_config is not None and getattr(
vllm_config.scheduler_config, "async_scheduling", False
):
logger.warning(
"Async scheduling with speculative decoding is not yet "
"supported on Ascend. Disabling async scheduling."
)
vllm_config.scheduler_config.async_scheduling = False

# ==================== 6. Speculative Config ====================
if vllm_config.speculative_config:
# Ascend automatically inherits main model quantization
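
The guard added to _fix_incompatible_config simply forces synchronous scheduling whenever a speculative config is present, because the Ascend runner lacks the GPU-side num_computed_tokens correction that async spec decode relies on. A self-contained sketch of the same logic; the dataclasses below are simplified stand-ins, not the real VllmConfig / SchedulerConfig classes:

from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class SchedulerConfigStub:
    async_scheduling: bool = True


@dataclass
class VllmConfigStub:
    scheduler_config: SchedulerConfigStub
    speculative_config: Optional[Any] = None


def fix_incompatible_config(cfg: VllmConfigStub) -> None:
    # Async spec decode needs a GPU-side num_computed_tokens correction that
    # the Ascend runner does not implement yet, so fall back to synchronous
    # scheduling whenever a speculative config is present.
    if cfg.speculative_config is not None and getattr(cfg.scheduler_config, "async_scheduling", False):
        cfg.scheduler_config.async_scheduling = False


cfg = VllmConfigStub(SchedulerConfigStub(async_scheduling=True), speculative_config=object())
fix_incompatible_config(cfg)
assert cfg.scheduler_config.async_scheduling is False
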
4 changes: 2 additions & 2 deletions vllm_ascend/sample/sampler.py
@@ -1,5 +1,5 @@
import torch
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
import vllm.envs as envs
from vllm.triton_utils import HAS_TRITON
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler
@@ -101,7 +101,7 @@ def forward_native(self, logits, generators, k, p):
"""Override pytorch native implementation to torch_npu"""
# when batch_invariant mode is enabled, we should use vllm's implementation.
# or it will make batch_invariant mode not working.
if vllm_is_batch_invariant():
if envs.VLLM_BATCH_INVARIANT:
return super().forward_native(logits, generators, k, p)
logits = self.apply_top_k_top_p(logits, k, p)
logits_to_return = None
9 changes: 6 additions & 3 deletions vllm_ascend/spec_decode/eagle_proposer.py
@@ -91,6 +91,9 @@ class SpecDecodeBaseProposer(EagleProposer):
def __init__(self, vllm_config: VllmConfig, device: torch.device, pass_hidden_states_to_model: bool, runner=None):
super().__init__(vllm_config, device, runner)

# Assign runner before it's used in the methods below
self.runner = runner

self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling
self.pass_hidden_states_to_model = pass_hidden_states_to_model
self.decode_threshold = 1 + self.num_speculative_tokens
@@ -373,8 +376,8 @@ def dummy_run(
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=self.query_start_loc.gpu[: num_reqs + 1],
query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs + 1],
seq_lens_cpu=self.runner.seq_lens.cpu,
seq_lens=self.runner.seq_lens.gpu[:num_reqs],
seq_lens_cpu=self.runner.optimistic_seq_lens_cpu,
seq_lens=self.runner.seq_lens[:num_reqs],
num_reqs=num_reqs,
num_actual_tokens=num_tokens,
num_input_tokens=num_tokens,
@@ -384,7 +387,7 @@
block_table_tensor=self.runner.input_batch.block_table[0].get_device_tensor()[:num_reqs],
# This is used to hold a position.
slot_mapping=self.runner.input_batch.block_table[0].slot_mapping.gpu,
positions=self.runner.positions.gpu,
positions=self.runner.positions,
attn_state=self.runner.attn_state,
decode_token_per_req=self.runner.decode_token_per_req,
max_seq_len=0,
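
These dummy_run changes track the runner cleanup described in the later commits: seq_lens and positions are now plain device tensors, and the host-side lengths live in a separate optimistic_seq_lens_cpu buffer, so the old .gpu / .cpu accessors are gone. A toy sketch of the attribute shapes this code now assumes; the RunnerStub class, dtypes, and sizes are illustrative guesses, not the real NPU model runner:

import torch


class RunnerStub:
    """Toy stand-in for the model-runner attributes that dummy_run reads above."""

    def __init__(self, max_num_reqs: int, device: torch.device):
        # Plain device tensors: indexed directly, no .gpu accessor.
        self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device)
        self.positions = torch.zeros(max_num_reqs, dtype=torch.int64, device=device)
        # Host-side copy kept separately so dummy_run avoids a device-to-host sync.
        self.optimistic_seq_lens_cpu = torch.zeros(max_num_reqs, dtype=torch.int32)


runner = RunnerStub(max_num_reqs=8, device=torch.device("cpu"))
seq_lens = runner.seq_lens[:4]                 # was runner.seq_lens.gpu[:num_reqs]
seq_lens_cpu = runner.optimistic_seq_lens_cpu  # was runner.seq_lens.cpu
positions = runner.positions                   # was runner.positions.gpu
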
4 changes: 2 additions & 2 deletions vllm_ascend/utils.py
@@ -259,7 +259,7 @@ def enable_custom_op():
Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component.
Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
"""
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
import vllm.envs as envs

global _CUSTOM_OP_ENABLED

@@ -271,7 +271,7 @@ def enable_custom_op():
# FIXME(linfeng): Currently custom op compilation and execution are partially available
# in ASCEND950 chip, we temporarily disable all custom ops. Please refer to
# https://github.com/vllm-project/vllm-ascend/issues/7157 for latest update about custom op.
if vllm_is_batch_invariant() or get_ascend_device_type() == AscendDeviceType.A5:
if envs.VLLM_BATCH_INVARIANT or get_ascend_device_type() == AscendDeviceType.A5:
_CUSTOM_OP_ENABLED = False
return _CUSTOM_OP_ENABLED
