diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index 44db2b46a09..16d48b9cf86 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -32,7 +32,7 @@ on: description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need vllm_version: required: false - default: "v0.15.0" + default: "v0.16.0" type: string description: vllm version to use vllm_ascend_remote_url: diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 51e52bed940..b03682959a3 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd + VLLM_COMMIT=b3c14229b032a8bbf93d450a52c9a404ddaea429 echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index c86324ae6b2..f56b51ce1b3 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd +ARG VLLM_COMMIT=b3c14229b032a8bbf93d450a52c9a404ddaea429 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 2275bfb043b..6093cf24651 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] + vllm_version: [b3c14229b032a8bbf93d450a52c9a404ddaea429, v0.16.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 4049953451d..36b1ef4ccb7 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd + vllm: b3c14229b032a8bbf93d450a52c9a404ddaea429 changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -87,7 +87,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] + vllm_version: [b3c14229b032a8bbf93d450a52c9a404ddaea429, v0.16.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -99,7 +99,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] + vllm_version: [b3c14229b032a8bbf93d450a52c9a404ddaea429, v0.16.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index ea7f97eac0b..5c729732aa5 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd] + vllm_version: [b3c14229b032a8bbf93d450a52c9a404ddaea429] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml index 9a011af62a2..3e555557d57 100644 --- a/.github/workflows/schedule_nightly_test_a2.yaml +++ b/.github/workflows/schedule_nightly_test_a2.yaml @@ -133,7 +133,7 @@ jobs: - Qwen3-Omni-30B-A3B-Instruct uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml with: - vllm: v0.15.0 + vllm: v0.16.0 runner: ${{ matrix.test_config.os }} model_list: ${{ toJson(matrix.test_config.model_list) }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11' diff --git a/.github/workflows/schedule_test_benchmarks.yaml b/.github/workflows/schedule_test_benchmarks.yaml index 4189ff9e84c..48ee1d14d18 100644 --- a/.github/workflows/schedule_test_benchmarks.yaml +++ b/.github/workflows/schedule_test_benchmarks.yaml @@ -51,7 +51,7 @@ jobs: strategy: matrix: include: - - vllm_branch: v0.15.0 + - vllm_branch: v0.16.0 vllm_ascend_branch: main max-parallel: 1 container: diff --git a/Dockerfile b/Dockerfile index a28ac7c27fa..b04ae9e1373 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index e967d62b958..ec5772a2dd9 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index b5d71af032a..92f941996e9 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3 b/Dockerfile.a3 index fd68662e6d8..17bd077b744 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -49,7 +49,7 @@ RUN apt-get update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 1636322f498..4e5b3838a7b 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -50,7 +50,7 @@ RUN yum update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 6ff7037751e..825b8a2461d 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -50,7 +50,7 @@ RUN yum update -y && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.15.0 +ARG VLLM_TAG=v0.16.0 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index a5b8e39bdd0..a77a4f356bc 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | b3c14229b032a8bbf93d450a52c9a404ddaea429, v0.16.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/docs/source/conf.py b/docs/source/conf.py index cab35bbac68..37199108971 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -77,7 +77,7 @@ # CANN image tag "cann_image_tag": "8.5.0-910b-ubuntu22.04-py3.11", # vllm version in ci - "ci_vllm_version": "v0.15.0", + "ci_vllm_version": "v0.16.0", } # For cross-file header anchors diff --git a/tests/e2e/singlecard/compile/backend.py b/tests/e2e/singlecard/compile/backend.py index e0fde30cbba..2866f7f094d 100644 --- a/tests/e2e/singlecard/compile/backend.py +++ b/tests/e2e/singlecard/compile/backend.py @@ -20,15 +20,10 @@ import torch.fx as fx from torch._inductor.decomposition import select_decomp_table +from vllm.compilation.passes.fx_utils import OpOverload from vllm.config import get_current_vllm_config from vllm_ascend.compilation.compiler_interface import compile_fx -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.fx_utils import OpOverload # type: ignore -else: - from vllm.compilation.passes.fx_utils import OpOverload class TestBackend: diff --git a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py index 00b2b123aa5..8e12d276855 100644 --- a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py +++ b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py @@ -19,6 +19,7 @@ import torch import torch.nn as nn import vllm.config +from vllm.compilation.passes.fx_utils import OpOverload from vllm.config import ModelConfig, VllmConfig from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment from vllm.utils.system_utils import update_environment_variables @@ -27,13 +28,7 @@ from tests.e2e.singlecard.compile.backend import TestBackend from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass -from vllm_ascend.utils import enable_custom_op, vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.fx_utils import OpOverload # type: ignore -else: - from vllm.compilation.passes.fx_utils import OpOverload - +from vllm_ascend.utils import enable_custom_op # Cache backend to avoid duplicate pattern registration _backend_cache = None diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py index df49283bff9..1265ddba7b7 100644 --- a/tests/ut/eplb/core/test_eplb_utils.py +++ b/tests/ut/eplb/core/test_eplb_utils.py @@ -22,9 +22,9 @@ def setUp(self, mock_fix_incompatible_config): "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2}, } from vllm.model_executor.layers.fused_moe.config import RoutingMethodType - if vllm_version_is("0.15.0"): + if vllm_version_is("0.16.0"): moe_parallel_config = FusedMoEParallelConfig( - 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True) + 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", is_sequence_parallel=True, enable_eplb=True) moe_config = FusedMoEConfig( num_experts=8, experts_per_token=8, diff --git a/tests/ut/quantization/test_modelslim_config.py b/tests/ut/quantization/test_modelslim_config.py index 2a9e0215b96..6176d3063c6 100644 --- a/tests/ut/quantization/test_modelslim_config.py +++ b/tests/ut/quantization/test_modelslim_config.py @@ -9,10 +9,7 @@ from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention # type: ignore -else: - from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.attention import Attention class TestAscendModelSlimConfig(TestBase): diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index dcd53535dc0..1403f5beec3 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -19,7 +19,6 @@ is_drafter_moe_model, is_moe_model, speculative_enable_dispatch_gmm_combine_decode, - vllm_version_is, ) @@ -152,10 +151,6 @@ def set_ascend_forward_context( mc2_mask[:num_actual_tokens] = True mc2_mask[num_actual_tokens:] = False forward_context.mc2_mask = mc2_mask - - if is_draft_model and vllm_version_is("0.15.0"): - forward_context.remaining_moe_layers = None - try: yield finally: diff --git a/vllm_ascend/compilation/graph_fusion_pass_manager.py b/vllm_ascend/compilation/graph_fusion_pass_manager.py index 6ec6b1d0d38..3c6226a4841 100644 --- a/vllm_ascend/compilation/graph_fusion_pass_manager.py +++ b/vllm_ascend/compilation/graph_fusion_pass_manager.py @@ -17,17 +17,10 @@ # from torch import fx as fx +from vllm.compilation.passes.inductor_pass import get_pass_context +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.inductor_pass import get_pass_context # type: ignore - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.inductor_pass import get_pass_context - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass - class GraphFusionPassManager: """ diff --git a/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py b/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py index 525543e0b5b..5fc4005816e 100644 --- a/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py +++ b/vllm_ascend/compilation/passes/allreduce_rmsnorm_fusion_pass.py @@ -16,6 +16,8 @@ # import torch from torch._inductor.pattern_matcher import Match, PatternMatcherPass, PatternPrettyPrinter +from vllm.compilation.passes.inductor_pass import get_pass_context +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig from vllm.config.compilation import Range from vllm.distributed import get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce @@ -24,14 +26,6 @@ from vllm_ascend.compilation.passes.base_pattern import BasePattern from vllm_ascend.compilation.passes.utils.npugraph_ex_utils_check import extra_stream_scope_check -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.inductor_pass import get_pass_context # type: ignore - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.inductor_pass import get_pass_context - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass # computation-communication tiling block is 512 ALLREDUCE_NORM_FUSE_THRESHOLD = 512 diff --git a/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py b/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py index b1f337597db..0c6fbde9516 100644 --- a/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py +++ b/vllm_ascend/compilation/passes/norm_quant_fusion_pass.py @@ -17,17 +17,13 @@ # import torch from torch._inductor.pattern_matcher import PatternMatcherPass +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig from vllm.config.compilation import Range from vllm.logger import logger from vllm_ascend.compilation.passes.base_pattern import BasePattern -from vllm_ascend.utils import enable_custom_op, vllm_version_is - -if vllm_version_is("0.15.0"): - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass +from vllm_ascend.utils import enable_custom_op class AddRMSNormQuantPattern(BasePattern): diff --git a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py index f7dd2832796..31b0c6f4579 100644 --- a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py +++ b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py @@ -17,19 +17,13 @@ # import torch from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter +from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config.compilation import Range from vllm.logger import logger +from vllm.model_executor.layers.attention import Attention from vllm_ascend.compilation.passes.base_pattern import BasePattern -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention # type: ignore - from vllm.compilation.vllm_inductor_pass import VllmInductorPass # type: ignore -else: - from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass - from vllm.model_executor.layers.attention import Attention class QKNormRopeFusionPattern(BasePattern): diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py index 614372dad87..481ff73c344 100644 --- a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py +++ b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py @@ -26,7 +26,6 @@ MetadataServerProc, MLAConfig, ) -from vllm_ascend.utils import vllm_version_is if TYPE_CHECKING: from vllm.forward_context import ForwardContext @@ -35,10 +34,7 @@ from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention, MLAAttention # type: ignore -else: - from vllm.model_executor.layers.attention import Attention, MLAAttention +from vllm.model_executor.layers.attention import Attention, MLAAttention @dataclass diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py index 98a4d892101..6932ac53d49 100644 --- a/vllm_ascend/kv_offload/cpu_npu.py +++ b/vllm_ascend/kv_offload/cpu_npu.py @@ -6,8 +6,6 @@ from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.worker import OffloadingHandler, TransferResult, TransferSpec -from vllm_ascend.utils import vllm_version_is - logger = init_logger(__name__) @@ -155,30 +153,22 @@ def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: def get_finished(self) -> list[TransferResult]: results: list[TransferResult] = [] - if vllm_version_is("v0.15.0"): - for job_id, event in self.transfer_events.items(): - if event.query(): - results.append((job_id, True)) - self.events_pool.append(event) - for job_id, _ in results: - del self.transfer_events[job_id] - else: - finished_job_ids = [] - for job_id, event in self.transfer_events.items(): - if event.query(): - results.append( - TransferResult( - job_id=job_id, - success=True, - transfer_size=None, - transfer_time=None, - transfer_type=None, - ) + finished_job_ids = [] + for job_id, event in self.transfer_events.items(): + if event.query(): + results.append( + TransferResult( + job_id=job_id, + success=True, + transfer_size=None, + transfer_time=None, + transfer_type=None, ) - finished_job_ids.append(job_id) - self.events_pool.append(event) - for job_id in finished_job_ids: - del self.transfer_events[job_id] + ) + finished_job_ids.append(job_id) + self.events_pool.append(event) + for job_id in finished_job_ids: + del self.transfer_events[job_id] return results def wait(self, job_ids: set[int]) -> None: diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index e2300a07035..1fdf4d68343 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -30,7 +30,7 @@ from vllm_ascend.utils import vllm_version_is -if not vllm_version_is("0.15.0"): +if not vllm_version_is("0.16.0"): from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore @@ -161,7 +161,7 @@ def apply( return final_hidden_states -if not vllm_version_is("0.15.0"): +if not vllm_version_is("0.16.0"): # Please remove this inheritance after extending vllm, todo(wxs) class AscendMoERunner(DefaultMoERunner): """ @@ -315,10 +315,10 @@ def __init__(self, *args, **kwargs): setup_moe_comm_method(self.moe_config) self.quant_type = self._get_quant_type() - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): self.runner = self._init_runner() - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): def _init_runner(self): # Storing the runner in the FusedMoE is an intermediate state, eventually @@ -364,7 +364,7 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens """ return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states) - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): def forward( self, @@ -511,8 +511,7 @@ def __init__( ): AscendFusedMoE.__init__(self, **kwargs) - if not vllm_version_is("0.15.0"): - self._routed_input_transform = routed_input_transform + self._routed_input_transform = routed_input_transform self._shared_experts = shared_experts self.use_overlapped = use_overlapped self.shared_expert_stream = None @@ -525,7 +524,7 @@ def __init__( logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") self._gate = gate - if not vllm_version_is("0.15.0"): + if not vllm_version_is("0.16.0"): # Recreate the runner with the correct shared_experts parameter # The parent class created the runner before self._shared_experts was set self.runner = self._init_runner() diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index 6f02cecdc38..3f7d66020c9 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -25,18 +25,13 @@ from vllm.config import CacheConfig, get_current_vllm_config from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import ForwardContext, get_forward_context +from vllm.model_executor.layers.attention import MLAAttention from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import MLAAttention # type: ignore -else: - from vllm.model_executor.layers.attention import MLAAttention class IndexerWrapper(nn.Module): @@ -126,17 +121,16 @@ def __init__( o_proj=mla_modules.o_proj, ) - if not vllm_version_is("v0.15.0"): - original_process_weights = self.mla_attn.process_weights_after_loading + original_process_weights = self.mla_attn.process_weights_after_loading - def wrapped_process_weights(act_dtype: torch.dtype): - from vllm_ascend.attention.sfa_v1 import AscendSFAImpl + def wrapped_process_weights(act_dtype: torch.dtype): + from vllm_ascend.attention.sfa_v1 import AscendSFAImpl - if not isinstance(self.mla_attn.impl, AscendSFAImpl): - original_process_weights(act_dtype) - self.mla_attn.impl.process_weights_after_loading(act_dtype) + if not isinstance(self.mla_attn.impl, AscendSFAImpl): + original_process_weights(act_dtype) + self.mla_attn.impl.process_weights_after_loading(act_dtype) - self.mla_attn.process_weights_after_loading = wrapped_process_weights + self.mla_attn.process_weights_after_loading = wrapped_process_weights compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index 0f12e27c658..52d9a74b156 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -19,11 +19,6 @@ import vllm_ascend.patch.platform.patch_distributed # noqa import vllm_ascend.patch.platform.patch_mamba_config # noqa import vllm_ascend.patch.platform.patch_sched_yield # noqa -from vllm_ascend import envs -from vllm_ascend.utils import vllm_version_is if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true": import vllm_ascend.patch.platform.patch_multiproc_executor # noqa - -if envs.VLLM_ASCEND_BALANCE_SCHEDULING and vllm_version_is("0.15.0"): - import vllm_ascend.patch.platform.patch_balance_schedule # noqa diff --git a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py index e87c5f3fd4f..1bd00e0c058 100644 --- a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py +++ b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py @@ -1,14 +1,8 @@ import torch import vllm.v1.worker.utils as utils +from vllm.model_executor.layers.attention import Attention from vllm.v1.worker.utils import defaultdict, extract_layer_index -from vllm_ascend.utils import vllm_version_is - -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention # type: ignore -else: - from vllm.model_executor.layers.attention import Attention - # Without this patch, it will raise an exception when initialize kv_cache. # TODO To remove the patch, we need check why the original bind_kv_cache raises an NotImplementedError. diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 9bfaa0bc2e1..88df5706cc7 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -481,7 +481,7 @@ def import_kernels(cls) -> None: _CUSTOM_OP_REGISTERED = True @classmethod - def get_attn_backend_cls(cls, selected_backend, attn_selector_config): + def get_attn_backend_cls(cls, selected_backend, attn_selector_config, num_heads: int | None = None): key = (attn_selector_config.use_mla, attn_selector_config.use_sparse) backend_map = { diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py index 0541bfb43ae..3f8ef78848b 100644 --- a/vllm_ascend/quantization/modelslim_config.py +++ b/vllm_ascend/quantization/modelslim_config.py @@ -390,12 +390,7 @@ def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["Qua self.packed_modules_mapping = packed_modules_model_mapping[model_type] prefix = self.quant_prefix_mapper(model_type, prefix) - from vllm_ascend.utils import vllm_version_is - - if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention # type: ignore - else: - from vllm.model_executor.layers.attention import Attention + from vllm.model_executor.layers.attention import Attention if model_type != "kimi_k2": if prefix.startswith("language_model"): diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 73a080b403f..9a6a5ec1da6 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -41,7 +41,7 @@ from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num -from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is +from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled # Currently we will fix block size to a small one since `num_reqs` can't be too large _PREPARE_INPUTS_BLOCK_SIZE = 4 @@ -357,11 +357,10 @@ def dummy_run( is_draft_model=True, draft_attn_metadatas=multi_steps_attn_metadata, ): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index before first model call - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index before first model call + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 self._runnable( num_input_tokens=num_tokens, @@ -507,11 +506,10 @@ def _propose( is_draft_model=True, draft_attn_metadatas=multi_steps_attn_metadata, ): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index for forward pass - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index for forward pass + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 draft_token_ids = self._runnable( num_input_tokens=num_input_tokens, @@ -602,11 +600,10 @@ def _run_merged_draft( forward_context.num_accept_tokens = batch_size for draft_step in range(self.num_speculative_tokens - 1): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index for each draft step iteration - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index for each draft step iteration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 # Update the inputs. # cast to int32 is crucial when eagle model is compiled. diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 873cb54336a..d27d4cbb504 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -16,7 +16,7 @@ from vllm_ascend.compilation.acl_graph import ACLGraphWrapper from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla from vllm_ascend.spec_decode.eagle_proposer import EagleProposer -from vllm_ascend.utils import lmhead_tp_enable, vllm_version_is +from vllm_ascend.utils import lmhead_tp_enable class MtpProposer(EagleProposer): @@ -130,11 +130,10 @@ def dummy_run( is_draft_model=True, in_profile_run=is_profile, ): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index for each MTP step iteration - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index for each MTP step iteration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 previous_hidden_states, positions = self.maybe_pad_and_reduce(previous_hidden_states, positions) self.model(input_ids=input_ids, positions=positions, hidden_states=previous_hidden_states) forward_context = get_forward_context() @@ -340,11 +339,10 @@ def _propose( num_actual_tokens=num_tokens, is_draft_model=True, ): - if not vllm_version_is("v0.15.0"): - # Reset MOE layer index for each MTP step to match all_moe_layers registration - forward_context = get_forward_context() - if forward_context is not None: - forward_context.moe_layer_index = 0 + # Reset MOE layer index for each MTP step to match all_moe_layers registration + forward_context = get_forward_context() + if forward_context is not None: + forward_context.moe_layer_index = 0 with record_function_or_nullcontext("mtp_forward"): model_kwargs = {} diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 6a9a236c93a..bd63cc7f359 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -525,12 +525,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV." ) - from vllm_ascend.utils import vllm_version_is - - if vllm_version_is("0.15.0"): - arch_name = vllm_config.model_config.architectures[0] - else: - arch_name = vllm_config.model_config.architecture + arch_name = vllm_config.model_config.architecture # If original sizes exceed maximum, sample a representative subset if max_num_batch_sizes < len(original_sizes): diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 1f0ff0bfe10..d33205cb260 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -136,12 +136,8 @@ else: xgr = LazyLoader("xgr", globals(), "xgrammar") -from vllm_ascend.utils import vllm_version_is -if vllm_version_is("v0.15.0"): - from vllm.attention.layer import Attention, MLAAttention # type: ignore -else: - from vllm.model_executor.layers.attention import Attention, MLAAttention +from vllm.model_executor.layers.attention import Attention, MLAAttention # if true, allow tensor initialization and casting with internal format (e.g., NZ) torch.npu.config.allow_internal_format = True