From 4c69c5f002a219b79404b2dbf672efa4dbfb544f Mon Sep 17 00:00:00 2001
From: wjunLu
Date: Thu, 15 Jan 2026 10:35:08 +0800
Subject: [PATCH] [Main2Main] Upgrade vllm commit to 0116

Signed-off-by: wjunLu
---
 .github/workflows/bot_pr_create.yaml            | 2 +-
 .github/workflows/pr_test_full.yaml             | 2 +-
 .github/workflows/pr_test_light.yaml            | 6 +++---
 .github/workflows/schedule_codecov_refresh.yaml | 2 +-
 docs/source/community/versioning_policy.md      | 2 +-
 vllm_ascend/attention/mla_v1.py                 | 6 ++++--
 vllm_ascend/attention/sfa_v1.py                 | 5 +++--
 vllm_ascend/patch/worker/patch_v2_egale.py      | 2 +-
 vllm_ascend/worker/v2/aclgraph_utils.py         | 7 ++++++-
 vllm_ascend/worker/v2/attn_utils.py             | 7 ++++++-
 vllm_ascend/worker/v2/sample/penalties.py       | 2 +-
 vllm_ascend/worker/v2/sample/sampler.py         | 2 +-
 12 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index 88a6e7c7917..0d055d31415 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=2c24bc6996cb165fce92f780b388a5e39b3f4060
+          VLLM_COMMIT=46f8a982b191e3a3d3a1eccaf18b184c391ac2ac
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

       - name: Checkout repository
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 39c9f584e6c..ad7aebca533 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [2c24bc6996cb165fce92f780b388a5e39b3f4060, v0.13.0]
+        vllm_version: [46f8a982b191e3a3d3a1eccaf18b184c391ac2ac, v0.13.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 5469a89730e..7226e38cb76 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 2c24bc6996cb165fce92f780b388a5e39b3f4060
+      vllm: 46f8a982b191e3a3d3a1eccaf18b184c391ac2ac
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -84,7 +84,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [2c24bc6996cb165fce92f780b388a5e39b3f4060, v0.13.0]
+        vllm_version: [46f8a982b191e3a3d3a1eccaf18b184c391ac2ac, v0.13.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -96,7 +96,7 @@ jobs:
     name: e2e-light
    strategy:
      matrix:
-       vllm_version: [2c24bc6996cb165fce92f780b388a5e39b3f4060, v0.13.0]
+       vllm_version: [46f8a982b191e3a3d3a1eccaf18b184c391ac2ac, v0.13.0]
    # Note (yikun): If CI resource are limited we can split job into two chain jobs
    needs: [lint, changes]  # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index c0c9b284451..e34bf928ec1 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
      matrix:
-       vllm_version: [2c24bc6996cb165fce92f780b388a5e39b3f4060]
+       vllm_version: [46f8a982b191e3a3d3a1eccaf18b184c391ac2ac]
    uses: ./.github/workflows/_unit_test.yaml
    with:
      vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index a6d795231b4..44f74cdfd17 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -53,7 +53,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 2c24bc6996cb165fce92f780b388a5e39b3f4060, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
+| main | 46f8a982b191e3a3d3a1eccaf18b184c391ac2ac, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

 ## Release cadence
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index 74dbe7da590..dd18b02e2d5 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -10,7 +10,6 @@
 from vllm.logger import logger
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.utils.math_utils import cdiv, round_down
-from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
 from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

 from vllm_ascend import envs
@@ -39,16 +38,19 @@
                               vllm_version_is, weak_ref_tensors)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
+
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput

 # isort: off
 if vllm_version_is('0.13.0'):
-    from vllm.v1.attention.backends.utils import AttentionCGSupport
+    from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder  # type: ignore
+    from vllm.v1.attention.backends.utils import AttentionCGSupport  # type: ignore
     from vllm.attention.backends.abstract import (  # type: ignore
         AttentionBackend, MLAAttentionImpl)
     from vllm.attention.backends.utils import PAD_SLOT_ID  # type: ignore
 else:
+    from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
     from vllm.v1.attention.backend import (  # type: ignore
         AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
     from vllm.v1.attention.backends.utils import PAD_SLOT_ID  # type: ignore
diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index 0fd62499a01..2b6dd3c7c79 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -11,7 +11,6 @@
 from vllm.logger import logger
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.triton_utils import HAS_TRITON
-from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
 from vllm.v1.kv_cache_interface import AttentionSpec

 from vllm_ascend import envs
@@ -39,10 +38,12 @@
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
 if vllm_version_is('0.13.0'):
-    from vllm.v1.attention.backends.utils import AttentionCGSupport
+    from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder  # type: ignore
+    from vllm.v1.attention.backends.utils import AttentionCGSupport  # type: ignore
     from vllm.attention.backends.abstract import (  # type: ignore
         AttentionBackend, MLAAttentionImpl)
 else:
+    from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
     from vllm.v1.attention.backend import (  # type: ignore
         AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
 # isort: on
diff --git a/vllm_ascend/patch/worker/patch_v2_egale.py b/vllm_ascend/patch/worker/patch_v2_egale.py
index 108df8cc2a9..24470e63361 100644
--- a/vllm_ascend/patch/worker/patch_v2_egale.py
+++ b/vllm_ascend/patch/worker/patch_v2_egale.py
@@ -21,7 +21,7 @@
 import vllm
 from vllm.v1.worker.gpu.input_batch import InputBatch
 from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
-from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
+from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu.spec_decode.eagle import (prepare_eagle_decode,
                                                   prepare_eagle_inputs)
diff --git a/vllm_ascend/worker/v2/aclgraph_utils.py b/vllm_ascend/worker/v2/aclgraph_utils.py
index 1fab82d246d..922755fc1d5 100644
--- a/vllm_ascend/worker/v2/aclgraph_utils.py
+++ b/vllm_ascend/worker/v2/aclgraph_utils.py
@@ -22,7 +22,6 @@
 import torch
 import torch.nn as nn
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.worker.gpu.block_table import BlockTables
 from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager
@@ -31,6 +30,12 @@
 from vllm.v1.worker.gpu.input_batch import InputBuffers

 from vllm_ascend.worker.v2.utils import torch_cuda_wrapper
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is('0.13.0'):
+    from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
+else:
+    from vllm.v1.attention.backend import AttentionMetadataBuilder


 class AclGraphManager(CudaGraphManager):
diff --git a/vllm_ascend/worker/v2/attn_utils.py b/vllm_ascend/worker/v2/attn_utils.py
index e8ed5a28445..b3c7fd903d6 100644
--- a/vllm_ascend/worker/v2/attn_utils.py
+++ b/vllm_ascend/worker/v2/attn_utils.py
@@ -23,13 +23,18 @@
 import numpy as np
 import torch
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.kv_cache_interface import EncoderOnlyAttentionSpec, KVCacheConfig

 from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
                                          AscendPrefillContextParallelMetadata)
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is('0.13.0'):
+    from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
+else:
+    from vllm.v1.attention.backend import AttentionMetadataBuilder

 _ATTENTION_MASK_BUILDER = None
diff --git a/vllm_ascend/worker/v2/sample/penalties.py b/vllm_ascend/worker/v2/sample/penalties.py
index fe730a9dd3c..c655ad46357 100644
--- a/vllm_ascend/worker/v2/sample/penalties.py
+++ b/vllm_ascend/worker/v2/sample/penalties.py
@@ -20,7 +20,7 @@
 import torch
 from vllm.triton_utils import tl, triton
-from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
+from vllm.v1.sample.metadata import SamplingMetadata


 @triton.jit
diff --git a/vllm_ascend/worker/v2/sample/sampler.py b/vllm_ascend/worker/v2/sample/sampler.py
index e54536c7a0b..8989363b901 100644
--- a/vllm_ascend/worker/v2/sample/sampler.py
+++ b/vllm_ascend/worker/v2/sample/sampler.py
@@ -17,7 +17,7 @@
 import torch
 from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
-from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata
+from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu.sample.min_p import apply_min_p
 from vllm.v1.worker.gpu.sample.sampler import Sampler
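
Note for reviewers: apart from the unconditional SamplingMetadata path update, the Python hunks above all apply the same version-gated import pattern, so a single source tree stays importable against both the pinned v0.13.0 tag and commit 46f8a982b191e3a3d3a1eccaf18b184c391ac2ac. A minimal sketch of the pattern follows; the module paths are the ones this patch switches between, and vllm_version_is is the existing helper from vllm_ascend.utils:

    # Version-gated import: resolve the class from whichever module path
    # the installed vllm provides. Paths are taken from the hunks above.
    from vllm_ascend.utils import vllm_version_is

    if vllm_version_is('0.13.0'):
        # Pinned release keeps the builder under the old v1 attention package.
        from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
    else:
        # Newer vllm main moved it during the upstream backend refactor.
        from vllm.v1.attention.backend import AttentionMetadataBuilder

Resolving the import at module load time keeps call sites identical on both versions; only the import block differs per file.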