2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purposes we actually need main-to-main matching.
-ARG VLLM_COMMIT=5af684c31912232e5c89484c2e8259e0fac6c55b
+ARG VLLM_COMMIT=6f786f2c506cb07f4566771fdc62e640e2c4a176
 RUN git init /vllm-workspace/vllm && \
     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
     git -C /vllm-workspace/vllm checkout FETCH_HEAD
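The Dockerfile fetches one pinned commit instead of cloning the full vLLM history: git init, a depth-1 fetch of the hash, then a checkout of FETCH_HEAD. The same hash is fanned out to the workflow matrices below. A minimal Python sketch of that pattern; the destination path is an arbitrary choice for illustration:

# Sketch of the pinned shallow-fetch pattern from the Dockerfile above:
# fetch only the one commit we need, never the full history.
import subprocess

VLLM_REPO = "https://github.com/vllm-project/vllm.git"
VLLM_COMMIT = "6f786f2c506cb07f4566771fdc62e640e2c4a176"

def checkout_pinned_commit(repo: str, commit: str, dest: str) -> None:
    """git init an empty repo, fetch a single commit at depth 1, check it out."""
    subprocess.run(["git", "init", dest], check=True)
    subprocess.run(["git", "-C", dest, "fetch", "--depth", "1", repo, commit], check=True)
    subprocess.run(["git", "-C", dest, "checkout", "FETCH_HEAD"], check=True)

checkout_pinned_commit(VLLM_REPO, VLLM_COMMIT, "/tmp/vllm-pinned")  # illustrative path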
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
+        vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 5af684c31912232e5c89484c2e8259e0fac6c55b
+      vllm: 6f786f2c506cb07f4566771fdc62e640e2c4a176
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -92,7 +92,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
+        vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -104,7 +104,7 @@
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
+        vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0]
     # Note (yikun): If CI resources are limited we can split this job into two chained jobs
     needs: [lint, changes]
     # Only trigger the e2e test after lint has passed and the pull request's change is e2e-related.
203 changes: 0 additions & 203 deletions .github/workflows/schedule_test_benchmarks.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/schedule_update_estimated_time.yaml
@@ -23,7 +23,7 @@ jobs:
     name: e2e-test
     strategy:
       matrix:
-        vllm_version: [v0.18.0]
+        vllm_version: [v0.19.0]
         type: [full, light]
     uses: ./.github/workflows/_e2e_test.yaml
     with:
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -80,7 +80,7 @@
     # CANN image tag
     "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
     # vLLM commit hash for main branch
-    "main_vllm_commit": "5af684c31912232e5c89484c2e8259e0fac6c55b",
+    "main_vllm_commit": "6f786f2c506cb07f4566771fdc62e640e2c4a176",
     # vLLM tag for main branch
     "main_vllm_tag": "v0.19.0",
     # Python version for main branch
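conf.py keeps the pinned commit, tag, and CANN image in one dictionary so the docs reference a single source of truth. How those values reach the pages is not shown in this diff; the sketch below assumes a MyST-style substitution mapping:

# Sketch only: the myst_substitutions mechanism is an assumption about the
# docs setup; the pinned values themselves are the ones from this diff.
pins = {
    "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
    "main_vllm_commit": "6f786f2c506cb07f4566771fdc62e640e2c4a176",
    "main_vllm_tag": "v0.19.0",
}
# Pages could then write {{ main_vllm_commit }} instead of hard-coding the hash.
myst_substitutions = dict(pins)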
2 changes: 2 additions & 0 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -22,6 +22,7 @@
 from vllm import SamplingParams
 
 from tests.e2e.conftest import VllmRunner
+from vllm_ascend.utils import vllm_version_is
 
 MODELS = ["Qwen/Qwen3-0.6B"]
 
@@ -63,6 +64,7 @@ def test_qwen3_dense_eager_mode(
         runner.model.generate(prompts, sampling_params)
 
 
+@pytest.mark.skipif(vllm_version_is("0.19.0"), reason="no need to support model_runner for v0.19.0")
 @pytest.mark.parametrize("model", MAIN_MODELS)
 @pytest.mark.parametrize("eagle_model", EGALE_MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
3 changes: 2 additions & 1 deletion vllm_ascend/patch/worker/patch_gdn_attn.py
@@ -23,6 +23,7 @@
     _validate_cu_seqlens,
     build_chunk_meta_device,
 )
+from vllm_ascend.utils import is_310p
 
 _GDN_CHUNK_SIZE = 64
 # Keep this aligned with solve_tril.LARGE_BLOCK_T in ops/triton/fla/solve_tril.py.
@@ -596,7 +597,7 @@ def _patched_build(
     return attn_metadata
 
 
-if not _IS_PATCHED:
+if not _IS_PATCHED and not is_310p():
     gdn_attn.GDNChunkedPrefillMetadata = GDNChunkedPrefillMetadata
     gdn_attn.GDNCausalConv1dHostMetadata = GDNCausalConv1dHostMetadata
     gdn_attn.GDNPrefillFallbackMeta = GDNPrefillFallbackMeta
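The patch module swaps its GDN metadata classes into vLLM's gdn_attn module at import time; the added is_310p() check keeps 310P devices on the unpatched path. A self-contained sketch of the guard pattern, with placeholder names standing in for the real module and capability check:

# Sketch of the idempotent, hardware-gated monkey-patch pattern above.
# upstream, fast_build, and supports_fast_path are placeholders; the real
# code patches vLLM's gdn_attn module and gates on is_310p().
import types

upstream = types.SimpleNamespace(build=lambda: "generic")

def supports_fast_path() -> bool:
    return True  # stand-in for the real hardware check

def fast_build() -> str:
    return "npu-optimized"

_IS_PATCHED = False
if not _IS_PATCHED and supports_fast_path():
    upstream.build = fast_build  # replace the upstream symbol exactly once
    _IS_PATCHED = True

assert upstream.build() == "npu-optimized"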
2 changes: 1 addition & 1 deletion vllm_ascend/quantization/modelslim_config.py
@@ -404,7 +404,7 @@ def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
         return cls(config)
 
     @classmethod
-    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
+    def override_quantization_method(cls, hf_quant_cfg, user_quant, hf_config: Any = None) -> str | None:
         if hf_quant_cfg is not None:
             quant_method = hf_quant_cfg.get("quant_method", None)
             if not quant_method and torch.npu.is_available():
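Upstream vLLM now passes an extra hf_config argument to override_quantization_method; defaulting it to None keeps this plugin compatible with both the old two-argument and the new three-argument call sites. A sketch of that compatibility pattern (the body is illustrative, not the real modelslim logic):

from typing import Any

class ConfigSketch:
    # Sketch of the widened hook: hf_config defaults to None so older vLLM
    # call sites that omit it keep working. The body is illustrative only.
    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant,
                                     hf_config: Any = None) -> str | None:
        if hf_quant_cfg is None:
            return None
        return hf_quant_cfg.get("quant_method", None)

# Both the old and the new call shapes succeed:
assert ConfigSketch.override_quantization_method({"quant_method": "w8a8"}, None) == "w8a8"
assert ConfigSketch.override_quantization_method({"quant_method": "w8a8"}, None, hf_config={}) == "w8a8"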
13 changes: 10 additions & 3 deletions vllm_ascend/worker/model_runner_v1.py
@@ -2449,7 +2449,6 @@ def _dummy_run(
         if create_mixed_batch:
             raise NotImplementedError("create_mixed_batch is used for warmup deepgemm, vllm-ascend does not need it")
         elif uniform_decode:
-            assert not create_mixed_batch
             num_reqs = min(max_num_reqs, cdiv(num_tokens, max_query_len))
             num_scheduled_tokens_list = [max_query_len] * num_reqs
             if num_tokens % max_query_len != 0:
@@ -3473,12 +3472,20 @@ def _check_and_update_cudagraph_mode(
         with update_pass_config(self):
             super()._check_and_update_cudagraph_mode(attention_backends, kv_cache_groups)
 
+        capture_descs = self.cudagraph_dispatcher.get_capture_descs()
+        capture_sizes = sorted({
+            desc.num_tokens
+            for _, descs in capture_descs
+            for desc in descs
+        })
+
         # NOTE: Since aclgraph_batch_sizes cannot be determined until here,
         # we set the graph params right before initializing the keys.
         if self.use_aclgraph:
-            set_graph_params(self.cudagraph_batch_sizes)
+            set_graph_params(capture_sizes)
         if self.speculative_config:
-            set_draft_graph_params(self.cudagraph_batch_sizes)
+            set_draft_graph_params(capture_sizes)
 
     def capture_model(self) -> None:
         gpu_model_runner_cls = next((cls for cls in self.__class__.__mro__ if cls.__name__ == "GPUModelRunner"), None)
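Instead of reading self.cudagraph_batch_sizes, the runner now derives graph capture sizes from the dispatcher's capture descriptors, deduplicating on num_tokens and sorting ascending. A self-contained sketch of that collection step; CaptureDesc and the sample data are stand-ins for the dispatcher's real descriptor type:

from dataclasses import dataclass

@dataclass(frozen=True)
class CaptureDesc:
    num_tokens: int

capture_descs = [
    ("decode", [CaptureDesc(1), CaptureDesc(8), CaptureDesc(64)]),
    ("mixed", [CaptureDesc(8), CaptureDesc(256)]),
]

# Flatten every descriptor, deduplicate via a set comprehension, sort ascending.
capture_sizes = sorted({
    desc.num_tokens
    for _, descs in capture_descs
    for desc in descs
})

assert capture_sizes == [1, 8, 64, 256]  # duplicates collapsed, ascending order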
2 changes: 2 additions & 0 deletions vllm_ascend/worker/v2/spec_decode/eagle/speculator.py
@@ -66,6 +66,7 @@ def propose(
         dummy_run: bool = False,
         skip_attn_for_dummy_run: bool = False,
         mm_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None,
+        is_profile: Any = None,
     ):
         """Override GPU EagleSpeculator.propose for Ascend NPUs,
         because npu attention metadata needs more information,
@@ -92,6 +93,7 @@
             dummy_run,
             skip_attn_for_dummy_run,
             mm_inputs,
+            is_profile=is_profile,
         )
 
     def generate_draft(
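The Ascend override gains the same is_profile parameter that the upstream EagleSpeculator.propose signature added, and forwards it by keyword so the two signatures stay in lockstep. A sketch of the passthrough pattern with illustrative class names:

from typing import Any

class GPUSpeculatorSketch:  # stand-in for the upstream base class
    def propose(self, target_hidden: str, dummy_run: bool = False,
                is_profile: Any = None) -> str:
        return f"draft(dummy_run={dummy_run}, is_profile={is_profile})"

class NPUSpeculatorSketch(GPUSpeculatorSketch):  # stand-in for the Ascend override
    def propose(self, target_hidden: str, dummy_run: bool = False,
                is_profile: Any = None) -> str:
        # NPU-specific metadata preparation would happen here; forwarding the
        # new argument by keyword means a future upstream reordering cannot
        # silently shift positional arguments.
        return super().propose(target_hidden, dummy_run, is_profile=is_profile)

print(NPUSpeculatorSketch().propose("h", is_profile=True))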