2 changes: 1 addition & 1 deletion .github/workflows/_e2e_nightly_multi_node.yaml
@@ -286,7 +286,7 @@ jobs:

- name: Upload logs
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v6
with:
name: ${{ inputs.config_file_path }}-pod-logs
path: /tmp/vllm*_logs.txt
4 changes: 2 additions & 2 deletions .github/workflows/_pr_image_build.yaml
@@ -103,14 +103,14 @@ jobs:
uses: actions/checkout@v6

- name: Download arm64 digests
uses: actions/download-artifact@v4
uses: actions/download-artifact@v7
with:
path: ${{ runner.temp }}/digests
pattern: digests-${{ inputs.suffix }}-arm64
merge-multiple: true

- name: Download amd64 digests
uses: actions/download-artifact@v4
uses: actions/download-artifact@v7
with:
path: ${{ runner.temp }}/digests
pattern: digests-${{ inputs.suffix }}-amd64
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -34,7 +34,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=7157596103666ee7ccb7008acee8bff8a8ff1731
VLLM_COMMIT=6ef770df7c3f0d135c2f3a594c461949113aae91
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
6 changes: 6 additions & 0 deletions .github/workflows/nightly_test_a3.yaml
@@ -68,6 +68,12 @@ jobs:
- name: multi-node-qwenw8a8-2node-eplb
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
size: 2
- name: multi-node-deepseek-r1-w8a8-longseq
config_file_path: DeepSeek-R1-W8A8-longseq.yaml
size: 2
- name: multi-node-qwenw8a8-2node-longseq
config_file_path: Qwen3-235B-W8A8-longseq.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
2 changes: 2 additions & 0 deletions .github/workflows/pr_tag_image_build_and_push.yaml
@@ -26,6 +26,8 @@ on:
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
# Also trigger the image build when nightly-test-related files change, so the image remains valid for nightly tests
- 'tests/e2e/nightly/'
types: [ labeled ]
push:
# Publish the image when tagging; the Dockerfile at the tag will be built as the tag image
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -74,7 +74,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
vllm_version: [6ef770df7c3f0d135c2f3a594c461949113aae91, v0.13.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 7157596103666ee7ccb7008acee8bff8a8ff1731
vllm: 6ef770df7c3f0d135c2f3a594c461949113aae91
changes:
runs-on: linux-aarch64-a2-0
outputs:
@@ -90,7 +90,7 @@ jobs:
SOC_VERSION: ascend910b1
strategy:
matrix:
vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
vllm_version: [6ef770df7c3f0d135c2f3a594c461949113aae91, v0.13.0]

steps:
- name: Free up disk space
@@ -163,7 +163,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0]
vllm_version: [6ef770df7c3f0d135c2f3a594c461949113aae91, v0.13.0]
# Note (yikun): If CI resources are limited, we can split the job into two chained jobs
needs: [lint, changes]
# Only trigger the e2e test after lint has passed and the pull request change is e2e related.
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -51,7 +51,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
For the main branch of vLLM Ascend, we usually keep it compatible with the latest vLLM release and a newer vLLM commit hash. Note that this table is updated regularly, so please check it often.
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | 7157596103666ee7ccb7008acee8bff8a8ff1731, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
| main | 6ef770df7c3f0d135c2f3a594c461949113aae91, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

## Release cadence

4 changes: 3 additions & 1 deletion docs/source/user_guide/configuration/additional_config.md
@@ -48,6 +48,7 @@ The following table lists additional configuration options available in vLLM Asc
| `num_wait_worker_iterations` | int | `30` | The number of forward iterations within which the EPLB worker finishes its CPU tasks. In our tests, the default value of 30 covers most cases. |
| `expert_map_record_path` | str | `None` | Save the expert load calculation results to a new expert table in the specified directory. |
| `init_redundancy_expert` | int | `0` | The number of redundant experts to allocate during initialization. |
| `enable_kv_nz` | bool | `False` | Whether to enable the NZ layout for the KV cache. This option only takes effect on models using MLA (e.g., DeepSeek). |

The details of each configuration option are as follows:

@@ -105,7 +106,8 @@ An example of additional configuration is as follows:
"embedding_tensor_parallel_size": 8,
"mlp_tensor_parallel_size": 8,
},
"enable_kv_nz": False,
"multistream_overlap_shared_expert": True,
"refresh": False,
"refresh": False
}
```
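For readers who want to try the `enable_kv_nz` option documented above, the sketch below shows one common way to pass `additional_config` to vLLM Ascend from Python. It is a minimal illustration, not part of this diff: the model name is an assumption, and the same dictionary can normally also be supplied to `vllm serve` as JSON via `--additional-config`.

```python
# Minimal sketch (assumption: not part of this PR) of passing additional_config
# to vLLM Ascend from Python. The model name is illustrative; enable_kv_nz only
# takes effect on MLA models such as DeepSeek.
from vllm import LLM

llm = LLM(
    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
    quantization="ascend",
    additional_config={
        "enable_kv_nz": True,   # KV cache NZ layout (MLA models only)
        "refresh": False,
    },
)

outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```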
6 changes: 4 additions & 2 deletions tests/e2e/multicard/test_aclgraph_capture_replay.py
@@ -28,7 +28,9 @@
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

MODELS = [
"Qwen/Qwen3-0.6B",
# wjunlu: Offline data parallel mode is not supported/useful for dense models;
# see https://github.com/vllm-project/vllm/pull/30739
# "Qwen/Qwen3-0.6B",
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
]

@@ -153,7 +155,7 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
"hidden_layers": multiprocessing.Value("i", -1),
}

dp_size = 2
dp_size = 2 if "DeepSeek" in model else 1
port = get_open_port()

# Launch workers
@@ -0,0 +1,109 @@
test_name: "test DeepSeek-R1-W8A8-longseq disaggregated_prefill"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
OMP_PROC_BIND: false
OMP_NUM_THREADS: 10
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
HCCL_DETERMINISTIC: True
TASK_QUEUE_ENABLE: 1
HCCL_OP_RETRY_ENABLE: "L0:0, L1:0"

disaggregated_prefill:
enabled: true
prefiller_host_index: [0]
decoder_host_index: [1]

deployment:
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 1
--decode-context-parallel-size 8
--prefill-context-parallel-size 2
--tensor-parallel-size 8
--cp-kv-cache-interleave-size 128
--enforce-eager
--enable-expert-parallel
--seed 1024
--quantization ascend
--max-num-seqs 4
--max-model-len 32768
--max-num-batched-tokens 16384
--trust-remote-code
--gpu-memory-utilization 0.9
--enable-chunked-prefill
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_producer",
"kv_port": "30000",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 8
},
"decode": {
"dp_size": 2,
"tp_size": 8
}
}
}'

-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--decode-context-parallel-size 2
--prefill-context-parallel-size 1
--tensor-parallel-size 8
--cp-kv-cache-interleave-size 128
--enable-expert-parallel
--seed 1024
--quantization ascend
--max-num-seqs 4
--max-model-len 32768
--max-num-batched-tokens 256
--trust-remote-code
--gpu-memory-utilization 0.9
--compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}'
--enable-chunked-prefill
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
"kv_port": "30100",
"engine_id": "1",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 8
},
"decode": {
"dp_size": 2,
"tp_size": 8
}
}
}'

benchmarks:
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 32768
batch_size: 512
baseline: 95
threshold: 5
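Once the two-node deployment above is running, a request like the following can serve as a quick sanity check before the gsm8k accuracy benchmark. This is a sketch rather than part of the test harness; it assumes the OpenAI-compatible endpoint is reachable on `SERVER_PORT` (8080) of whichever node or proxy fronts the prefiller/decoder pair, and the host and prompt are placeholders.

```python
# Minimal sanity-check sketch (assumption: not part of this PR). Host, port, and
# prompt are placeholders; adjust to wherever the disaggregated service is exposed.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="vllm-ascend/DeepSeek-R1-0528-W8A8",
    messages=[{"role": "user", "content": "What is 17 * 24? Think step by step."}],
    max_tokens=256,
)
print(resp.choices[0].message.content)
```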
@@ -0,0 +1,93 @@
test_name: "test Qwen3-235B-A22B-W8A8-longseq disaggregated_prefill"
model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
OMP_PROC_BIND: false
OMP_NUM_THREADS: 100
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
NUMEXPR_MAX_THREADS: 128
disaggregated_prefill:
enabled: true
prefiller_host_index: [0]
decoder_host_index: [1]

deployment:
-
server_cmd: >
vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 1
--decode-context-parallel-size 2
--prefill-context-parallel-size 2
--tensor-parallel-size 8
--cp-kv-cache-interleave-size 128
--seed 1024
--enforce-eager
--enable-expert-parallel
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 8192
--quantization ascend
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_producer",
"kv_port": "30000",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 8
},
"decode": {
"dp_size": 2,
"tp_size": 8
}
}
}'

-
server_cmd: >
vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--decode-context-parallel-size 2
--prefill-context-parallel-size 1
--tensor-parallel-size 8
--cp-kv-cache-interleave-size 128
--seed 1024
--quantization ascend
--max-num-seqs 16
--max-model-len 8192
--max-num-batched-tokens 8192
--enable-expert-parallel
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
"kv_port": "30100",
"engine_id": "1",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 8
},
"decode": {
"dp_size": 2,
"tp_size": 8
}
}
}'
benchmarks:
@@ -136,10 +136,10 @@ def test_token_dispatcher_with_all_gather(
expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input)

sorted_hidden_states = dispatch_output["hidden_states"]
group_list = dispatch_output["group_list"]
group_list_type = dispatch_output.get("group_list_type", 1)
context_metadata = dispatch_output["context_metadata"]
sorted_hidden_states = dispatch_output.hidden_states
group_list = dispatch_output.group_list
group_list_type = dispatch_output.group_list_type
context_metadata = dispatch_output.context_metadata

expert_output = apply_mlp(hidden_states=sorted_hidden_states,
w1=w1_local,
@@ -155,7 +155,7 @@ def test_token_dispatcher_with_all_gather(
torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk,
expert_map)

torch.testing.assert_close(combined_output,
torch.testing.assert_close(combined_output.routed_out,
torch_output,
atol=4e-2,
rtol=1)
@@ -216,11 +216,11 @@ def test_token_dispatcher_with_all_gather_quant(
apply_router_weight_on_input=apply_router_weight_on_input,
with_quant=True)

sorted_hidden_states = dispatch_output["hidden_states"]
group_list = dispatch_output["group_list"]
group_list_type = dispatch_output.get("group_list_type", 1)
dynamic_scale = dispatch_output["dynamic_scale"]
context_metadata = dispatch_output["context_metadata"]
sorted_hidden_states = dispatch_output.hidden_states
group_list = dispatch_output.group_list
group_list_type = dispatch_output.group_list_type
dynamic_scale = dispatch_output.dynamic_scale
context_metadata = dispatch_output.context_metadata

expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
w1=w1,
@@ -235,7 +235,7 @@ def test_token_dispatcher_with_all_gather_quant(
hidden_states=expert_output,
context_metadata=context_metadata,
bias=None)
assert combined_output.shape == (m, k)
assert combined_output.routed_out.shape == (m, k)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()