From a130214e3fa916be0f313a32ca05eb8a3f321b2c Mon Sep 17 00:00:00 2001 From: leo-pony Date: Mon, 16 Mar 2026 21:12:56 +0800 Subject: [PATCH 01/15] upgrade to 0316 Signed-off-by: leo-pony --- .github/workflows/bot_pr_create.yaml | 2 +- .github/workflows/dockerfiles/Dockerfile.lint | 2 +- .github/workflows/pr_test_full.yaml | 2 +- .github/workflows/pr_test_light.yaml | 6 +++--- .github/workflows/schedule_codecov_refresh.yaml | 2 +- docs/source/community/versioning_policy.md | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 357a1e32a03..80a9bcbdd79 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87 + VLLM_COMMIT=43a73f853bac76e6c95c629e4aaa0858f610eb11 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index 9116b5a692c..8ac69312305 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=4497431df654e46fb1fb5e64bf8611e762ae5d87 +ARG VLLM_COMMIT=43a73f853bac76e6c95c629e4aaa0858f610eb11 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index b7f35825cef..faa3c9461e6 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0] + vllm_version: [43a73f853bac76e6c95c629e4aaa0858f610eb11, v0.17.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 0ce86dfa626..503ce49e708 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 4497431df654e46fb1fb5e64bf8611e762ae5d87 + vllm: 43a73f853bac76e6c95c629e4aaa0858f610eb11 changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -90,7 +90,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0] + vllm_version: [43a73f853bac76e6c95c629e4aaa0858f610eb11, v0.17.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -102,7 +102,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0] + vllm_version: [43a73f853bac76e6c95c629e4aaa0858f610eb11, v0.17.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index a50a9c164ac..da8e1e4f3f6 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [4497431df654e46fb1fb5e64bf8611e762ae5d87] + vllm_version: [43a73f853bac76e6c95c629e4aaa0858f610eb11] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 9bab96b970b..3b3a91551cf 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | 43a73f853bac76e6c95c629e4aaa0858f610eb11, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence From 3bae902fa05aa1c5f6e187b67f04557caf0ffdda Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 17 Mar 2026 03:55:41 +0000 Subject: [PATCH 02/15] fix: adapt to upstream vLLM changes (4034c3d..43a73f8) Root causes: - CompilationConfig.compile_ranges_split_points renamed to compile_ranges_endpoints (4b87ffb) - torch.accelerator.memory_stats/reserved not supported on NPU (747b068) - get_attn_backend() removed block_size parameter (77a7345) Upstream commit range: 4034c3d32e30d01639459edd3ab486f56993876d..43a73f853bac76e6c95c629e4aaa0858f610eb11 Signed-off-by: leo-pony Co-Authored-By: Claude Code Signed-off-by: leo-pony --- vllm_ascend/ascend_config.py | 32 ++++++++++++++----- vllm_ascend/platform.py | 9 ++++++ vllm_ascend/worker/model_runner_v1.py | 44 +++++++++++++-------------- vllm_ascend/worker/v2/README.md | 2 +- 4 files changed, 56 insertions(+), 31 deletions(-) diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 8dd634279c4..b5ff1ec2d44 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -181,30 +181,48 @@ def _construct_weight_prefetch_config(self, additional_config): stacklevel=2, ) + @staticmethod + def _get_compile_ranges(compilation_config): + from vllm_ascend.utils import vllm_version_is + + if vllm_version_is("0.17.0"): + return compilation_config.compile_ranges_split_points + else: + return compilation_config.compile_ranges_endpoints + + @staticmethod + def _set_compile_ranges(compilation_config, value): + from vllm_ascend.utils import vllm_version_is + + if vllm_version_is("0.17.0"): + compilation_config.compile_ranges_split_points = value + else: + compilation_config.compile_ranges_endpoints = value + def update_compile_ranges_split_points(self): vllm_config = self.vllm_config if self.ascend_compilation_config.enable_npugraph_ex: if self.ascend_compilation_config.fuse_allreduce_rms: from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD - new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points + new_compile_ranges_split_points = self._get_compile_ranges(vllm_config.compilation_config) new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD) new_compile_ranges_split_points = 
sorted(new_compile_ranges_split_points) - vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points + self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points) logger.debug( "set compile_ranges_split_points to " "{new_compile_ranges_split_points} for matmul and allreduce fusion" ) else: - new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points + new_compile_ranges_split_points = self._get_compile_ranges(vllm_config.compilation_config) if vllm_config.additional_config.get("ascend_compilation_config", {}).get("fuse_allreduce_rms", True): from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD - new_compile_ranges_split_points = vllm_config.compilation_config.compile_ranges_split_points + new_compile_ranges_split_points = self._get_compile_ranges(vllm_config.compilation_config) new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD) new_compile_ranges_split_points = sorted(new_compile_ranges_split_points) - vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points + self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points) logger.debug( "set compile_ranges_split_points to " "{new_compile_ranges_split_points} for matmul and allreduce fusion" @@ -218,9 +236,9 @@ def update_compile_ranges_split_points(self): sp_threshold = get_sp_threshold(vllm_config) new_compile_ranges_split_points.append(sp_threshold) logger.debug(f"add {sp_threshold} to compile_ranges_split_points for sequence parallelism") - if len(new_compile_ranges_split_points) > len(vllm_config.compilation_config.compile_ranges_split_points): + if len(new_compile_ranges_split_points) > len(self._get_compile_ranges(vllm_config.compilation_config)): new_compile_ranges_split_points = sorted(new_compile_ranges_split_points) - vllm_config.compilation_config.compile_ranges_split_points = new_compile_ranges_split_points + self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points) class FinegrainedTPConfig: diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 55e1408ca2f..25333345914 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -27,6 +27,15 @@ from vllm.logger import logger from vllm.platforms import Platform, PlatformEnum +# Monkey-patch torch.accelerator memory APIs for NPU compatibility. +# Upstream vLLM (commit 747b068) replaced current_platform.memory_stats() +# with torch.accelerator.memory_stats(), but torch.accelerator does not +# properly delegate to NPU. We redirect to torch.npu.* equivalents. 
+if hasattr(torch, "npu"): + torch.accelerator.memory_stats = torch.npu.memory_stats # type: ignore[attr-defined] + torch.accelerator.memory_reserved = torch.npu.memory_reserved # type: ignore[attr-defined] + torch.accelerator.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats # type: ignore[attr-defined] + # todo: please remove it when solve cuda hard code in vllm os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1" diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 96f0f78b1a6..752d08771b0 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -278,30 +278,28 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.is_multimodal_model = self.model_config.is_multimodal_model self.block_size = vllm_config.cache_config.block_size # Set up Attention - self.use_sparse = hasattr(vllm_config.model_config, "hf_text_config") and hasattr( - vllm_config.model_config.hf_text_config, "index_topk" - ) - if self.use_sparse: - self.sparse_head_dim = ( - self.model_config.hf_text_config.kv_lora_rank, - self.model_config.hf_text_config.qk_rope_head_dim, - self.model_config.hf_text_config.index_head_dim, + self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config, "index_topk") + from vllm_ascend.utils import vllm_version_is + + if vllm_version_is("0.17.0"): + self.attn_backend = get_attn_backend( + 0, + self.dtype, + None, + self.block_size, + use_mla=self.model_config.use_mla, + use_sparse=self.use_sparse, + use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm, + ) + else: + self.attn_backend = get_attn_backend( + 0, + self.dtype, + None, + use_mla=self.model_config.use_mla, + use_sparse=self.use_sparse, + use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm, ) - # dsa c8 - self.use_sparse_c8_indexer = self.ascend_config.enable_sparse_c8 - if self.use_sparse_c8_indexer: - self.c8_k_cache_dtype = torch.int8 - self.c8_k_scale_cache_dtype = torch.float16 - - self.attn_backend = get_attn_backend( - 0, - self.dtype, - None, - self.block_size, - use_mla=self.model_config.use_mla, - use_sparse=self.use_sparse, - use_mm_prefix=self.model_config is not None and self.model_config.is_mm_prefix_lm, - ) try: self.dcp_size = get_dcp_group().world_size diff --git a/vllm_ascend/worker/v2/README.md b/vllm_ascend/worker/v2/README.md index 1c1309e6a48..9436945d6c5 100644 --- a/vllm_ascend/worker/v2/README.md +++ b/vllm_ascend/worker/v2/README.md @@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development. please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208) to get specific plans. 
-supported vllm version: main@4034c3d32e30d01639459edd3ab486f56993876d +supported vllm version: main@43a73f853bac76e6c95c629e4aaa0858f610eb11 related PR: From 60ead9115714b0ba794e36f1953b53165ef6af3d Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 17 Mar 2026 03:59:21 +0000 Subject: [PATCH 03/15] set continue run when failed Signed-off-by: leo-pony --- .github/workflows/_e2e_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 5404cfb0562..bc127c0f7a9 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -18,7 +18,7 @@ on: continue_on_error: required: false type: boolean - default: false + default: true env: UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi From 508959901c2f0edf57e38d17eb81be544a8386a3 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 17 Mar 2026 04:48:02 +0000 Subject: [PATCH 04/15] fix: restore use_sparse_c8_indexer init and guard xlite config attrs - Restore use_sparse_c8_indexer initialization in NPUModelRunner that was dropped during rebase - Guard deepstack_num_level, mrope_section, mrope_interleaved with hasattr checks since xlite C++ ModelConfig may not have these attrs Co-Authored-By: Claude Opus 4.6 Signed-off-by: leo-pony --- vllm_ascend/worker/model_runner_v1.py | 5 +++++ vllm_ascend/xlite/xlite.py | 9 ++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 752d08771b0..021833791cf 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -279,6 +279,11 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.block_size = vllm_config.cache_config.block_size # Set up Attention self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config, "index_topk") + # dsa c8 + self.use_sparse_c8_indexer = self.ascend_config.enable_sparse_c8 + if self.use_sparse_c8_indexer: + self.c8_k_cache_dtype = torch.int8 + self.c8_k_scale_cache_dtype = torch.float16 from vllm_ascend.utils import vllm_version_is if vllm_version_is("0.17.0"): diff --git a/vllm_ascend/xlite/xlite.py b/vllm_ascend/xlite/xlite.py index ac3b1f9c7d4..64133235724 100644 --- a/vllm_ascend/xlite/xlite.py +++ b/vllm_ascend/xlite/xlite.py @@ -92,9 +92,12 @@ def _build_model_config(self, vllm_config: VllmConfig) -> ModelConfig: vision_config = getattr(vllm_config.model_config.hf_config, "vision_config", None) rope_parameters = getattr(hf_config, "rope_parameters", {}) - config.deepstack_num_level = len(getattr(vision_config, "deepstack_visual_indexes", [])) - config.mrope_section = rope_parameters.get("mrope_section", []) - config.mrope_interleaved = rope_parameters.get("mrope_interleaved", False) + if hasattr(config, "deepstack_num_level"): + config.deepstack_num_level = len(getattr(vision_config, "deepstack_visual_indexes", [])) + if hasattr(config, "mrope_section"): + config.mrope_section = rope_parameters.get("mrope_section", []) + if hasattr(config, "mrope_interleaved"): + config.mrope_interleaved = rope_parameters.get("mrope_interleaved", False) return config def _build_model(self, runnable: nn.Module, vllm_config: VllmConfig, config: ModelConfig) -> Model: From c60dadb74ed83408acf0f5ec7f7d367790da6699 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 17 Mar 2026 06:52:23 +0000 Subject: [PATCH 05/15] upgrade to 
3-17 afternoon Signed-off-by: leo-pony --- .github/workflows/bot_pr_create.yaml | 2 +- .github/workflows/dockerfiles/Dockerfile.lint | 2 +- .github/workflows/pr_test_full.yaml | 2 +- .github/workflows/pr_test_light.yaml | 6 +++--- .github/workflows/schedule_codecov_refresh.yaml | 2 +- docs/source/community/versioning_policy.md | 2 +- vllm_ascend/worker/v2/README.md | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 80a9bcbdd79..bbaba802678 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=43a73f853bac76e6c95c629e4aaa0858f610eb11 + VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index 8ac69312305..277c874e978 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=43a73f853bac76e6c95c629e4aaa0858f610eb11 +ARG VLLM_COMMIT=8a680463fab3bc9e6760417cd5c0a6aa58283065 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index faa3c9461e6..76b5cbc89c1 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [43a73f853bac76e6c95c629e4aaa0858f610eb11, v0.17.0] + vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 503ce49e708..e607b0c2abe 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 43a73f853bac76e6c95c629e4aaa0858f610eb11 + vllm: 8a680463fab3bc9e6760417cd5c0a6aa58283065 changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -90,7 +90,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [43a73f853bac76e6c95c629e4aaa0858f610eb11, v0.17.0] + vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -102,7 +102,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [43a73f853bac76e6c95c629e4aaa0858f610eb11, v0.17.0] + vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index da8e1e4f3f6..74864db0b53 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [43a73f853bac76e6c95c629e4aaa0858f610eb11] + vllm_version: [8a680463fab3bc9e6760417cd5c0a6aa58283065] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 3b3a91551cf..386c7064373 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 43a73f853bac76e6c95c629e4aaa0858f610eb11, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | 8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/vllm_ascend/worker/v2/README.md b/vllm_ascend/worker/v2/README.md index 9436945d6c5..22353e9ff00 100644 --- a/vllm_ascend/worker/v2/README.md +++ b/vllm_ascend/worker/v2/README.md @@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development. please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208) to get specific plans. -supported vllm version: main@43a73f853bac76e6c95c629e4aaa0858f610eb11 +supported vllm version: main@8a680463fab3bc9e6760417cd5c0a6aa58283065 related PR: From 4aee1ed05ce7d0dca3bcd52d8c4749e74f2fa574 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 17 Mar 2026 09:12:56 +0000 Subject: [PATCH 06/15] fix: use get_eagle3_default_aux_hidden_state_layers after SupportsEagle3 refactor Upstream vLLM commit 8b346309 (Consolidate SupportsEagle #36063) renamed get_eagle3_aux_hidden_state_layers() to get_eagle3_default_aux_hidden_state_layers() and added a supports_eagle3() guard before calling it. Update model_runner_v1.py to match upstream: add supports_eagle3 check and use the new method name to fix AttributeError on Qwen3ForCausalLM. 
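For reference, the adapted call site takes roughly the following shape on post-0.17.0 vLLM. This is a sketch only: the helper name _set_eagle3_aux_layers is illustrative, while supports_eagle3, set_aux_hidden_state_layers and get_eagle3_default_aux_hidden_state_layers mirror the diff below.

    from vllm.model_executor.models.interfaces import supports_eagle3

    def _set_eagle3_aux_layers(model, use_aux_hidden_state_outputs: bool) -> None:
        # Only models implementing the SupportsEagle3 interface expose the
        # renamed default aux-hidden-state-layer helper, so guard first.
        if not use_aux_hidden_state_outputs:
            return
        if not supports_eagle3(model):
            raise RuntimeError(
                "Model does not support EAGLE3 interface but "
                "aux_hidden_state_outputs was requested")
        model.set_aux_hidden_state_layers(
            model.get_eagle3_default_aux_hidden_state_layers())
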
Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: leo-pony --- vllm_ascend/worker/model_runner_v1.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 021833791cf..dfb56ac6e46 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2556,7 +2556,14 @@ def load_model(self) -> None: with get_tp_context(self.drafter): self.drafter.load_model(self.model) if self.use_aux_hidden_state_outputs: - self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers()) + from vllm.model_executor.models.interfaces import supports_eagle3 + if not supports_eagle3(self.model): + raise RuntimeError( + "Model does not support EAGLE3 interface but " + "aux_hidden_state_outputs was requested" + ) + aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers() + self.model.set_aux_hidden_state_layers(aux_layers) if self.lora_config: self.model = self.load_lora_model(self.model, self.vllm_config, self.device) From 45f97b348226de1003ebd8b86d4ec5229449615a Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 17 Mar 2026 09:41:37 +0000 Subject: [PATCH 07/15] fix: adapt NPUOffloadingSpec to upstream OffloadingSpec refactor Upstream vLLM commit cfaf4668 (Support multiple KV groups in OffloadingSpec #36610) removed self.offloaded_block_size and changed self.gpu_block_size from a scalar to a tuple of per-group block sizes, adding block_size_factor. Update NPUOffloadingSpec.get_manager() and get_handlers() to match the new API: extract gpu_block_size[0] and compute offloaded_block_size via gpu_block_size * block_size_factor. Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: leo-pony --- vllm_ascend/kv_offload/npu.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/kv_offload/npu.py b/vllm_ascend/kv_offload/npu.py index 211f3dcef8e..bd68ed16b27 100644 --- a/vllm_ascend/kv_offload/npu.py +++ b/vllm_ascend/kv_offload/npu.py @@ -33,8 +33,11 @@ def get_manager(self) -> OffloadingManager: if not self._manager: kv_events_config = self.vllm_config.kv_events_config enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events + assert len(self.gpu_block_size) == 1 + gpu_block_size = self.gpu_block_size[0] + offloaded_block_size = gpu_block_size * self.block_size_factor self._manager = LRUOffloadingManager( - CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks), + CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks), enable_events=enable_events, ) return self._manager @@ -45,10 +48,12 @@ def get_handlers( attn_backends: dict[str, type[AttentionBackend]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: if not self._handler: + assert len(self.gpu_block_size) == 1 + gpu_block_size = self.gpu_block_size[0] self._handler = CpuNpuOffloadingHandler( attn_backends=attn_backends, - gpu_block_size=self.gpu_block_size, - cpu_block_size=self.offloaded_block_size, + gpu_block_size=gpu_block_size, + cpu_block_size=gpu_block_size * self.block_size_factor, num_cpu_blocks=self.num_cpu_blocks, gpu_caches=kv_caches, ) From 312867d65ba4412562efafa64dce8e98cde15d23 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 17 Mar 2026 09:57:27 +0000 Subject: [PATCH 08/15] fix: restore sparse_head_dim initialization in NPUModelRunner The sparse_head_dim tuple (kv_lora_rank, qk_rope_head_dim, index_head_dim) was dropped during rebase but 
is required by get_kv_cache_spec() when use_sparse is True (DSv3.1 sparse MLA models). Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: leo-pony --- vllm_ascend/worker/model_runner_v1.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index dfb56ac6e46..485ce97dfa5 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -279,6 +279,12 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.block_size = vllm_config.cache_config.block_size # Set up Attention self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config, "index_topk") + if self.use_sparse: + self.sparse_head_dim = ( + self.model_config.hf_text_config.kv_lora_rank, + self.model_config.hf_text_config.qk_rope_head_dim, + self.model_config.hf_text_config.index_head_dim, + ) # dsa c8 self.use_sparse_c8_indexer = self.ascend_config.enable_sparse_c8 if self.use_sparse_c8_indexer: From bc8aecc4ad67ccc16ba92f51a0a6a2799e545ea3 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Tue, 17 Mar 2026 14:28:03 +0000 Subject: [PATCH 09/15] restore continue on error to false Signed-off-by: leo-pony --- .github/workflows/_e2e_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index bc127c0f7a9..5404cfb0562 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -18,7 +18,7 @@ on: continue_on_error: required: false type: boolean - default: true + default: false env: UV_INDEX_URL: http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple UV_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi From 26637c822b1a2e49ed5c2e2eedcf1ac6d644884a Mon Sep 17 00:00:00 2001 From: leo-pony Date: Wed, 18 Mar 2026 03:56:40 +0000 Subject: [PATCH 10/15] offloaded_block_size and eagle3 aux hidden state fix compatible 0.17.0 handle Signed-off-by: leo-pony --- vllm_ascend/kv_offload/npu.py | 53 ++++++++++++++++++--------- vllm_ascend/worker/model_runner_v1.py | 19 ++++++---- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/vllm_ascend/kv_offload/npu.py b/vllm_ascend/kv_offload/npu.py index bd68ed16b27..183ab9ed338 100644 --- a/vllm_ascend/kv_offload/npu.py +++ b/vllm_ascend/kv_offload/npu.py @@ -31,15 +31,23 @@ def __init__(self, vllm_config: VllmConfig, kv_cache_config: KVCacheConfig | Non def get_manager(self) -> OffloadingManager: if not self._manager: - kv_events_config = self.vllm_config.kv_events_config - enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events - assert len(self.gpu_block_size) == 1 - gpu_block_size = self.gpu_block_size[0] - offloaded_block_size = gpu_block_size * self.block_size_factor - self._manager = LRUOffloadingManager( - CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks), - enable_events=enable_events, - ) + if vllm_version_is("0.17.0"): + kv_events_config = self.vllm_config.kv_events_config + enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events + self._manager = LRUOffloadingManager( + CPUBackend(block_size=self.offloaded_block_size, num_blocks=self.num_cpu_blocks), + enable_events=enable_events, + ) + else: + kv_events_config = self.vllm_config.kv_events_config + enable_events = kv_events_config is not None and kv_events_config.enable_kv_cache_events + assert len(self.gpu_block_size) == 1 + gpu_block_size = self.gpu_block_size[0] + 
offloaded_block_size = gpu_block_size * self.block_size_factor + self._manager = LRUOffloadingManager( + CPUBackend(block_size=offloaded_block_size, num_blocks=self.num_cpu_blocks), + enable_events=enable_events, + ) return self._manager def get_handlers( @@ -48,15 +56,24 @@ def get_handlers( attn_backends: dict[str, type[AttentionBackend]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: if not self._handler: - assert len(self.gpu_block_size) == 1 - gpu_block_size = self.gpu_block_size[0] - self._handler = CpuNpuOffloadingHandler( - attn_backends=attn_backends, - gpu_block_size=gpu_block_size, - cpu_block_size=gpu_block_size * self.block_size_factor, - num_cpu_blocks=self.num_cpu_blocks, - gpu_caches=kv_caches, - ) + if vllm_version_is("0.17.0"): + self._handler = CpuNpuOffloadingHandler( + attn_backends=attn_backends, + gpu_block_size=self.gpu_block_size, + cpu_block_size=self.offloaded_block_size, + num_cpu_blocks=self.num_cpu_blocks, + gpu_caches=kv_caches, + ) + else: + assert len(self.gpu_block_size) == 1 + gpu_block_size = self.gpu_block_size[0] + self._handler = CpuNpuOffloadingHandler( + attn_backends=attn_backends, + gpu_block_size=gpu_block_size, + cpu_block_size=gpu_block_size * self.block_size_factor, + num_cpu_blocks=self.num_cpu_blocks, + gpu_caches=kv_caches, + ) assert self._handler is not None yield GPULoadStoreSpec, CPULoadStoreSpec, self._handler diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 485ce97dfa5..e370f82b9be 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2562,14 +2562,17 @@ def load_model(self) -> None: with get_tp_context(self.drafter): self.drafter.load_model(self.model) if self.use_aux_hidden_state_outputs: - from vllm.model_executor.models.interfaces import supports_eagle3 - if not supports_eagle3(self.model): - raise RuntimeError( - "Model does not support EAGLE3 interface but " - "aux_hidden_state_outputs was requested" - ) - aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers() - self.model.set_aux_hidden_state_layers(aux_layers) + if vllm_version_is("0.17.0"): + self.model.set_aux_hidden_state_layers(self.model.get_eagle3_aux_hidden_state_layers()) + else: + from vllm.model_executor.models.interfaces import supports_eagle3 + if not supports_eagle3(self.model): + raise RuntimeError( + "Model does not support EAGLE3 interface but " + "aux_hidden_state_outputs was requested" + ) + aux_layers = self.model.get_eagle3_default_aux_hidden_state_layers() + self.model.set_aux_hidden_state_layers(aux_layers) if self.lora_config: self.model = self.load_lora_model(self.model, self.vllm_config, self.device) From 95989f0cfb96c4653c84653c1910a620d49f5e3a Mon Sep 17 00:00:00 2001 From: leo-pony Date: Wed, 18 Mar 2026 04:13:44 +0000 Subject: [PATCH 11/15] CI format fix Signed-off-by: leo-pony --- tests/e2e/multicard/2-cards/test_disaggregated_encoder.py | 3 +++ vllm_ascend/kv_offload/npu.py | 1 + 2 files changed, 4 insertions(+) diff --git a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py index ec5ca3a4daf..38486fc2256 100644 --- a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py +++ b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py @@ -31,6 +31,9 @@ @pytest.mark.asyncio @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) +@pytest.skip( + reason="EPLB output is different without EPLB, see 
issue: https://github.com/vllm-project/vllm-ascend/issues/7408" +) async def test_models(model: str, tp_size: int) -> None: encode_port = get_open_port() pd_port = get_open_port() diff --git a/vllm_ascend/kv_offload/npu.py b/vllm_ascend/kv_offload/npu.py index 183ab9ed338..828df509e07 100644 --- a/vllm_ascend/kv_offload/npu.py +++ b/vllm_ascend/kv_offload/npu.py @@ -12,6 +12,7 @@ from vllm.v1.kv_offload.worker.worker import OffloadingHandler from vllm_ascend.kv_offload.cpu_npu import CpuNpuOffloadingHandler +from vllm_ascend.utils import vllm_version_is class NPUOffloadingSpec(OffloadingSpec): From cd9be41393fe0a53ea33078dcd8e32e1ff78db76 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Wed, 18 Mar 2026 07:34:49 +0000 Subject: [PATCH 12/15] ci format fix Signed-off-by: leo-pony --- .../e2e/multicard/2-cards/test_disaggregated_encoder.py | 9 ++++++--- vllm_ascend/ascend_config.py | 1 - vllm_ascend/patch/platform/patch_torch_accelerator.py | 8 ++++++++ vllm_ascend/platform.py | 9 --------- vllm_ascend/worker/model_runner_v1.py | 4 +++- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py index 38486fc2256..2c173cadb37 100644 --- a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py +++ b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py @@ -31,10 +31,13 @@ @pytest.mark.asyncio @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -@pytest.skip( - reason="EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408" -) async def test_models(model: str, tp_size: int) -> None: + from vllm_ascend.utils import vllm_version_is + + if vllm_version_is("0.17.0"): + pytest.skip( + "EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408", + ) encode_port = get_open_port() pd_port = get_open_port() vllm_server_args = [ diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index b5ff1ec2d44..ebd6c5aa09e 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -219,7 +219,6 @@ def update_compile_ranges_split_points(self): if vllm_config.additional_config.get("ascend_compilation_config", {}).get("fuse_allreduce_rms", True): from vllm_ascend.compilation.passes.allreduce_rmsnorm_fusion_pass import ALLREDUCE_NORM_FUSE_THRESHOLD - new_compile_ranges_split_points = self._get_compile_ranges(vllm_config.compilation_config) new_compile_ranges_split_points.append(ALLREDUCE_NORM_FUSE_THRESHOLD) new_compile_ranges_split_points = sorted(new_compile_ranges_split_points) self._set_compile_ranges(vllm_config.compilation_config, new_compile_ranges_split_points) diff --git a/vllm_ascend/patch/platform/patch_torch_accelerator.py b/vllm_ascend/patch/platform/patch_torch_accelerator.py index 431dce4e51b..43bf4c109a6 100644 --- a/vllm_ascend/patch/platform/patch_torch_accelerator.py +++ b/vllm_ascend/patch/platform/patch_torch_accelerator.py @@ -6,3 +6,11 @@ def patch_empty_cache() -> None: torch.accelerator.empty_cache = patch_empty_cache + +# Monkey-patch torch.accelerator memory APIs for NPU compatibility. +# Upstream vLLM (commit 747b068) replaced current_platform.memory_stats() +# with torch.accelerator.memory_stats(), but torch.accelerator does not +# properly delegate to NPU. We redirect to torch.npu.* equivalents. 
+torch.accelerator.memory_stats = torch.npu.memory_stats # type: ignore[attr-defined] +torch.accelerator.memory_reserved = torch.npu.memory_reserved # type: ignore[attr-defined] +torch.accelerator.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats # type: ignore[attr-defined] diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 25333345914..55e1408ca2f 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -27,15 +27,6 @@ from vllm.logger import logger from vllm.platforms import Platform, PlatformEnum -# Monkey-patch torch.accelerator memory APIs for NPU compatibility. -# Upstream vLLM (commit 747b068) replaced current_platform.memory_stats() -# with torch.accelerator.memory_stats(), but torch.accelerator does not -# properly delegate to NPU. We redirect to torch.npu.* equivalents. -if hasattr(torch, "npu"): - torch.accelerator.memory_stats = torch.npu.memory_stats # type: ignore[attr-defined] - torch.accelerator.memory_reserved = torch.npu.memory_reserved # type: ignore[attr-defined] - torch.accelerator.reset_peak_memory_stats = torch.npu.reset_peak_memory_stats # type: ignore[attr-defined] - # todo: please remove it when solve cuda hard code in vllm os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1" diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index e370f82b9be..1bf7095e41e 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -278,7 +278,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.is_multimodal_model = self.model_config.is_multimodal_model self.block_size = vllm_config.cache_config.block_size # Set up Attention - self.use_sparse = hasattr(self.vllm_config.model_config.hf_text_config, "index_topk") + self.use_sparse = hasattr(vllm_config.model_config, "hf_text_config") and hasattr( + vllm_config.model_config.hf_text_config, "index_topk" + ) if self.use_sparse: self.sparse_head_dim = ( self.model_config.hf_text_config.kv_lora_rank, From a98f62b00737c10a76577907c6b232c37da18763 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Wed, 18 Mar 2026 10:59:51 +0000 Subject: [PATCH 13/15] fix skip eplb error Signed-off-by: leo-pony --- docs/source/community/versioning_policy.md | 2 +- tests/e2e/multicard/2-cards/test_disaggregated_encoder.py | 2 +- tests/e2e/multicard/2-cards/test_qwen3_moe.py | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 386c7064373..9bab96b970b 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | 4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py index 2c173cadb37..6635f491997 100644 --- a/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py +++ b/tests/e2e/multicard/2-cards/test_disaggregated_encoder.py @@ -34,7 +34,7 @@ async def test_models(model: str, tp_size: int) -> None: from 
vllm_ascend.utils import vllm_version_is - if vllm_version_is("0.17.0"): + if not vllm_version_is("0.17.0"): pytest.skip( "EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408", ) diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe.py b/tests/e2e/multicard/2-cards/test_qwen3_moe.py index 385b32e83cc..4ce5e33e681 100644 --- a/tests/e2e/multicard/2-cards/test_qwen3_moe.py +++ b/tests/e2e/multicard/2-cards/test_qwen3_moe.py @@ -76,6 +76,12 @@ def test_qwen3_moe_distributed_aiv_tp2(): @pytest.mark.asyncio async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb(): + from vllm_ascend.utils import vllm_version_is + + if not vllm_version_is("0.17.0"): + pytest.skip( + "EPLB output is different without EPLB, see issue: https://github.com/vllm-project/vllm-ascend/issues/7408", + ) model = "vllm-ascend/Qwen3-30B-A3B-W8A8" port = get_open_port() compilation_config = json.dumps({"cudagraph_capture_sizes": [8]}) From 305798d564f1e8bc4ac85e975f17799ee6ef8ea8 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Wed, 18 Mar 2026 11:38:14 +0000 Subject: [PATCH 14/15] fix commit id error for worker v2 Signed-off-by: leo-pony --- docs/source/community/versioning_policy.md | 2 +- vllm_ascend/worker/v2/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 9bab96b970b..386c7064373 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 4497431df654e46fb1fb5e64bf8611e762ae5d87, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | 8a680463fab3bc9e6760417cd5c0a6aa58283065, v0.17.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/vllm_ascend/worker/v2/README.md b/vllm_ascend/worker/v2/README.md index 22353e9ff00..734ad141cc5 100644 --- a/vllm_ascend/worker/v2/README.md +++ b/vllm_ascend/worker/v2/README.md @@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development. please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208) to get specific plans. -supported vllm version: main@8a680463fab3bc9e6760417cd5c0a6aa58283065 +supported vllm version: main@4497431df654e46fb1fb5e64bf8611e762ae5d87 related PR: From b155f1849c27b8cb96ebbe26b222c05b5d16d0ed Mon Sep 17 00:00:00 2001 From: leo-pony Date: Wed, 18 Mar 2026 11:53:36 +0000 Subject: [PATCH 15/15] fix commit id error for worker v2 readme Signed-off-by: leo-pony --- vllm_ascend/worker/v2/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/worker/v2/README.md b/vllm_ascend/worker/v2/README.md index 734ad141cc5..1c1309e6a48 100644 --- a/vllm_ascend/worker/v2/README.md +++ b/vllm_ascend/worker/v2/README.md @@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development. please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208) to get specific plans. -supported vllm version: main@4497431df654e46fb1fb5e64bf8611e762ae5d87 +supported vllm version: main@4034c3d32e30d01639459edd3ab486f56993876d related PR: