6 changes: 2 additions & 4 deletions .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,10 +27,8 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm

# # Install vLLM common dependencies
RUN python3 -m pip install -r /vllm-workspace/vllm/requirements/common.txt --extra-index https://download.pytorch.org/whl/cpu/ && \
4 changes: 2 additions & 2 deletions .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
@@ -102,7 +102,7 @@ jobs:
strategy:
fail-fast: false
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [v0.20.1]
needs: [parse-trigger]
if: ${{ needs.parse-trigger.outputs.allowed == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 4d51588e2381018348f1022dfa3a7698899805b7
vllm: c7aa186d67b6f051680831418e957c67f34ba7a2
changes:
runs-on: linux-aarch64-a2b3-0
container:
@@ -154,7 +154,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
strategy:
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
uses: ./.github/workflows/_optional_smart_e2e.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -164,7 +164,7 @@
name: e2e-light
strategy:
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_update_estimated_time.yaml
@@ -23,7 +23,7 @@ jobs:
name: e2e-test
strategy:
matrix:
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm_version: [v0.20.1]
type: [full, light]
uses: ./.github/workflows/_e2e_test.yaml
with:
2 changes: 1 addition & 1 deletion .github/workflows/schedule_vllm_e2e_test.yaml
@@ -45,7 +45,7 @@
fail-fast: false
matrix:
part: [0, 1, 2, 3]
vllm: [4d51588e2381018348f1022dfa3a7698899805b7]
vllm: [v0.20.1]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11
env:
2 changes: 1 addition & 1 deletion .github/workflows/scripts/config.yaml
@@ -166,4 +166,4 @@ e2e-multicard-4-cards:
- name: tests/e2e/multicard/4-cards/test_pipeline_parallel.py
estimated_time: 679
- name: tests/e2e/multicard/4-cards/test_profiling_chunk_performance.py
estimated_time: 1300
estimated_time: 1300
8 changes: 2 additions & 6 deletions Dockerfile
@@ -48,12 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.310p
@@ -33,12 +33,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.310p.openEuler
@@ -32,12 +32,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.a3
@@ -50,12 +50,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.a3.openEuler
@@ -49,12 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
8 changes: 2 additions & 6 deletions Dockerfile.openEuler
@@ -49,12 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# ARG VLLM_TAG=v0.19.1
# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4d51588e2381018348f1022dfa3a7698899805b7
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -81,9 +81,9 @@
# CANN image tag
"cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
# vLLM commit hash for main branch
"main_vllm_commit": "4d51588e2381018348f1022dfa3a7698899805b7",
"main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2",
# vLLM tag for main branch
"main_vllm_tag": "v0.19.1",
"main_vllm_tag": "v0.20.1",
# Python version for main branch
"main_python_version": ">= 3.10, < 3.12",
# CANN version for main branch
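These entries look like documentation substitution values, so the pinned commit and tag only need to be updated in one place when vLLM moves forward. A hedged sketch of how such a mapping could be wired up in a Sphinx conf.py follows; the myst_substitutions name and usage are assumptions about this repo's docs setup, not something the diff confirms.

# docs/source/conf.py -- sketch only, assuming MyST substitutions are in use
myst_substitutions = {
    # CANN image tag
    "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
    # vLLM commit hash for main branch
    "main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2",
    # vLLM tag for main branch
    "main_vllm_tag": "v0.20.1",
}
# Pages could then reference {{ main_vllm_tag }} instead of hard-coding v0.20.1.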
3 changes: 3 additions & 0 deletions mypy.ini
@@ -44,3 +44,6 @@ ignore_missing_imports = True
[mypy-jiwer]
ignore_missing_imports = True

[mypy-vllm.v1.kv_offload.*]
ignore_missing_imports = True

2 changes: 2 additions & 0 deletions tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -25,6 +25,7 @@
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
from vllm_ascend.utils import vllm_version_is


@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
@@ -74,6 +74,7 @@ def test_qwen3_moe_distributed_aiv_tp2():
vllm_model.generate_greedy(example_prompts, max_tokens)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})
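The new skip guard in this test relies on the vllm_version_is helper imported from vllm_ascend.utils. Below is a minimal sketch of the pattern, assuming the helper simply compares the installed vLLM version string against the argument (the real helper may check more than that); the test body is a hypothetical placeholder.

import pytest

def vllm_version_is(target: str) -> bool:
    # Assumption: mirrors vllm_ascend.utils.vllm_version_is by comparing the
    # installed vLLM release string against the requested version.
    import vllm
    return vllm.__version__ == target

@pytest.mark.skipif(vllm_version_is("0.20.1"),
                    reason="no need to support model_runner for v0.20.1")
def test_v2_model_runner_only_on_main():
    # Placeholder body; the real tests drive VllmRunner against Ascend hardware.
    assert True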
4 changes: 4 additions & 0 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
@@ -22,13 +22,15 @@
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import vllm_version_is

MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]

MAIN_MODELS = ["LLM-Research/Meta-Llama-3.1-8B-Instruct"]
EGALE_MODELS = ["vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"]


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [True])
@@ -63,6 +65,7 @@ def test_qwen3_dense_eager_mode(
runner.model.generate(prompts, sampling_params)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.parametrize("model", MAIN_MODELS)
@pytest.mark.parametrize("eagle_model", EGALE_MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@@ -101,6 +104,7 @@ def test_egale_spec_decoding(
runner.model.generate(prompts, sampling_params)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [False])
59 changes: 0 additions & 59 deletions tests/ut/worker/test_model_runner_v2.py

This file was deleted.

20 changes: 14 additions & 6 deletions vllm_ascend/core/scheduler_profiling_chunk.py
@@ -41,6 +41,7 @@
from vllm.v1.utils import record_function_or_nullcontext

from vllm_ascend.core.profiling_chunk_predictor import ProfilingChunkManager
from vllm_ascend.utils import vllm_version_is


class ProfilingChunkScheduler(Scheduler):
@@ -575,12 +576,16 @@ def schedule(self) -> SchedulerOutput: # noqa: C901
if self.is_encoder_decoder and request.has_encoder_inputs and encoder_inputs_to_schedule:
num_encoder_tokens = sum(request.get_num_encoder_embeds(i) for i in encoder_inputs_to_schedule)

if self.scheduler_reserve_full_isl and not self.kv_cache_manager.can_fit_full_sequence(
request,
num_new_computed_tokens=num_new_local_computed_tokens,
new_computed_blocks=new_computed_blocks,
num_external_computed_tokens=num_external_computed_tokens,
num_encoder_tokens=num_encoder_tokens,
if (
vllm_version_is("0.20.1")
and self.scheduler_reserve_full_isl
and not self.kv_cache_manager.can_fit_full_sequence(
request,
num_new_computed_tokens=num_new_local_computed_tokens,
new_computed_blocks=new_computed_blocks,
num_external_computed_tokens=num_external_computed_tokens,
num_encoder_tokens=num_encoder_tokens,
)
):
if request.has_encoder_inputs:
self.encoder_cache_manager.free(request)
@@ -595,6 +600,9 @@ def schedule(self) -> SchedulerOutput: # noqa: C901
num_external_computed_tokens=num_external_computed_tokens,
delay_cache_blocks=load_kv_async,
num_encoder_tokens=num_encoder_tokens,
**(
{} if vllm_version_is("0.20.1") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl}
),
)

if new_blocks is None:
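The **( ... ) unpacking added above keeps a single call site working across both pinned vLLM versions: the full_sequence_must_fit keyword is only forwarded when the installed vLLM is newer than v0.20.1. A minimal sketch of the idea follows; the version check and the allocate_slots callee here are stand-in simplifications, not the real scheduler API.

def vllm_version_is(target: str) -> bool:
    # Stand-in for vllm_ascend.utils.vllm_version_is.
    installed = "0.20.1"
    return installed == target

def allocate_slots(request, num_new_tokens, **kwargs):
    # Hypothetical callee; only newer versions accept full_sequence_must_fit.
    print(f"allocate {num_new_tokens} tokens for {request}, extra={kwargs}")

def schedule_request(request, num_new_tokens, reserve_full_isl):
    # Forward the keyword only when the running vLLM understands it.
    extra = {} if vllm_version_is("0.20.1") else {"full_sequence_must_fit": reserve_full_isl}
    allocate_slots(request, num_new_tokens, **extra)

schedule_request("req-0", 128, reserve_full_isl=True)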
1 change: 1 addition & 0 deletions vllm_ascend/patch/platform/__init__.py
@@ -19,6 +19,7 @@
import vllm_ascend.patch.platform.patch_distributed # noqa
import vllm_ascend.patch.platform.patch_kv_cache_interface # noqa
import vllm_ascend.patch.platform.patch_kv_cache_utils # noqa
import vllm_ascend.patch.platform.patch_mla_prefill_backend # noqa
from vllm_ascend import envs
from vllm_ascend.utils import is_310p
