vllm-project · wxsIcey · Feb 5, 2026 · Feb 5, 2026 · Feb 6, 2026 · Feb 6, 2026
@@ -80,7 +80,7 @@ jobs:
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+          python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --continue-on-error
 
   e2e-full:
     name: singlecard-full
@@ -145,7 +145,7 @@ jobs:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
         run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
+          python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --continue-on-error
 
   e2e-2-cards-light:
     name: multicard-2-light
@@ -209,7 +209,7 @@ jobs:
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-2card-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+          python3 .github/workflows/scripts/run_suite.py --suite e2e-2card-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --continue-on-error
 
   e2e-2-cards-full:
     name: multicard-2-full
@@ -273,7 +273,7 @@ jobs:
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-2-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+          python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-2-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --continue-on-error
 
       - name: Run vllm-project/vllm-ascend test (non triton)
         if: ${{ inputs.type == 'full' && matrix.part == 0 }}
@@ -345,7 +345,7 @@ jobs:
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
-          python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-4-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+          python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-4-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --continue-on-error
 
   e2e_310p:
     name: 310p singlecard

@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
+          VLLM_COMMIT=c4df59ad43037a846eed353ce4c17dc264d18f4a
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
 
       - name: Checkout repository

@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
+ARG VLLM_COMMIT=c4df59ad43037a846eed353ce4c17dc264d18f4a
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT

@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
+        vllm_version: [c4df59ad43037a846eed353ce4c17dc264d18f4a, v0.15.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml

@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
+      vllm: c4df59ad43037a846eed353ce4c17dc264d18f4a
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -87,7 +87,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
+        vllm_version: [c4df59ad43037a846eed353ce4c17dc264d18f4a, v0.15.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
+        vllm_version: [c4df59ad43037a846eed353ce4c17dc264d18f4a, v0.15.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a]
+        vllm_version: [c4df59ad43037a846eed353ce4c17dc264d18f4a]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}

@@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 
 | vLLM Ascend | vLLM         | Python           | Stable CANN | PyTorch/torch_npu  |
 |-------------|--------------|------------------|-------------|--------------------|
-|     main    | d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0 tag | >= 3.10, < 3.12   | 8.5.0 | 2.9.0 / 2.9.0 |
+|     main    | c4df59ad43037a846eed353ce4c17dc264d18f4a, v0.15.0 tag | >= 3.10, < 3.12   | 8.5.0 | 2.9.0 / 2.9.0 |
 
 ## Release cadence
 

@@ -25,8 +25,8 @@
 import torch
 from vllm.utils.network_utils import get_open_port
 
-from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
 from tests.e2e.conftest import wait_until_npu_memory_free
+from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
 
 MODELS = [
     # Offline data parallel mode will be not supported/useful for dense models
@@ -85,8 +85,7 @@ def _run_worker_process(
 
     # Import vLLM only after environment setup
     from vllm import LLM, SamplingParams
-    from vllm.distributed.parallel_state import (
-        destroy_distributed_environment, destroy_model_parallel)
+    from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
 
     # Apply hooks and run inference
     with _install_spies(counters):
@@ -208,8 +207,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     expected_exec_model = (total_steps + 1 + 1) * dp_size
 
     assert (
-        num_execute_model == expected_exec_model
-    ), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
+        expected_exec_model - dp_size < num_execute_model <= expected_exec_model
+    ), f"Model execution count mismatch. Expected range: [{expected_exec_model - dp_size}, \
+    {expected_exec_model}], Got: {num_execute_model}"
 
     # Metric 3: Dummy Runs (Warmup & Alignment)
     # vLLM synchronizes globally every 32 steps.
@@ -229,8 +229,8 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
     expected_dummy_run = (warmup_runs + padding_runs) * dp_size
 
     assert (
-        num_dummy_run == expected_dummy_run
-    ), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
+        expected_dummy_run <= num_dummy_run <= expected_dummy_run + dp_size
+    ), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}, Tolerance: ±{dp_size}"
 
     # Metric 4: Graph Replay (Inference Execution)
     # Replays happen for every aligned step across all graphs.

@@ -19,7 +19,7 @@
 
 import torch.fx as fx
 from torch._inductor.decomposition import select_decomp_table
-from vllm.compilation.fx_utils import OpOverload
+from torch._ops import OpOverload
 from vllm.config import get_current_vllm_config
 
 from vllm_ascend.compilation.compiler_interface import compile_fx

@@ -21,7 +21,7 @@
 import torch.nn as nn
 import torch_npu
 import vllm.config
-from vllm.compilation.fx_utils import OpOverload
+from torch._ops import OpOverload
 from vllm.config import ModelConfig, VllmConfig
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)

@@ -1,13 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from unittest.mock import patch
+
 import vllm
 import vllm.config
 from vllm.lora.request import LoRARequest
-from unittest.mock import patch
 
 from tests.e2e.conftest import VllmRunner
-from vllm_ascend.utils import enable_custom_op
+from vllm_ascend.utils import enable_custom_op, vllm_version_is
 
 enable_custom_op()
 
@@ -23,12 +24,20 @@
 ###Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 """  # noqa: E501
 
-EXPECTED_LORA_OUTPUT = [
-    "SELECT count(*) FROM candidate",
-    "SELECT count(*) FROM candidate",
-    "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
-    "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
-]
+if vllm_version_is("0.15.0"):
+    EXPECTED_LORA_OUTPUT = [
+        "SELECT count(*) FROM candidate",
+        "SELECT count(*) FROM candidate",
+        "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
+        "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1",  # noqa: E501
+    ]
+else:
+    EXPECTED_LORA_OUTPUT = [
+        "SELECT COUNT(*) FROM candidate",
+        "SELECT COUNT(*) FROM candidate",
+        "SELECT Poll_Source FROM candidate GROUP BY Poll_Source ORDER BY COUNT(*) DESC LIMIT 1;",
+        "SELECT t1.Poll_Source FROM candidate AS t1 JOIN people AS t2 ON t1.People_ID  =  t2.People_ID GROUP BY t1.Poll_Source ORDER BY COUNT(*) DESC LIMIT 1",  # noqa: E501
+    ]
 
 EXPECTED_BASE_MODEL_OUTPUT = [
     "SELECT COUNT(*) FROM candidate",