diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index c3f15a90f13..879bc4efe44 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -80,7 +80,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1] + vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml @@ -102,7 +102,7 @@ jobs: strategy: fail-fast: false matrix: - vllm_version: [v0.19.1] + vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] needs: [parse-trigger] if: ${{ needs.parse-trigger.outputs.allowed == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index bc51592122f..2f6678c126c 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -154,7 +154,7 @@ jobs: if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }} strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1] + vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] uses: ./.github/workflows/_optional_smart_e2e.yaml with: vllm: ${{ matrix.vllm_version }} @@ -164,7 +164,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1] + vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_update_estimated_time.yaml b/.github/workflows/schedule_update_estimated_time.yaml index 79586a891ec..b8a18b4c71b 100644 --- a/.github/workflows/schedule_update_estimated_time.yaml +++ b/.github/workflows/schedule_update_estimated_time.yaml @@ -23,7 +23,7 @@ jobs: name: e2e-test strategy: matrix: - vllm_version: [v0.19.1] + vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] type: [full, light] uses: ./.github/workflows/_e2e_test.yaml with: diff --git a/.github/workflows/schedule_vllm_e2e_test.yaml b/.github/workflows/schedule_vllm_e2e_test.yaml index 8e610ff8b88..253aaf59df1 100644 --- a/.github/workflows/schedule_vllm_e2e_test.yaml +++ b/.github/workflows/schedule_vllm_e2e_test.yaml @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: part: [0, 1, 2, 3] - vllm: [v0.19.1] + vllm: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11 env: diff --git a/Dockerfile b/Dockerfile index 473d7b084a5..221cafb89ba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,8 +48,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.19.1 -RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +# ARG VLLM_TAG=v0.19.1 +# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +RUN git init /vllm-workspace/vllm && \ + git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ + git -C /vllm-workspace/vllm checkout FETCH_HEAD # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.310p b/Dockerfile.310p index 2362a579109..79000a1eb53 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -33,8 +33,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.19.1 -RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +# ARG VLLM_TAG=v0.19.1 +# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +RUN git init /vllm-workspace/vllm && \ + git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ + git -C /vllm-workspace/vllm checkout FETCH_HEAD # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index 75865bc9f82..27ec4290229 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -32,8 +32,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.19.1 -RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +# ARG VLLM_TAG=v0.19.1 +# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +RUN git init /vllm-workspace/vllm && \ + git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ + git -C /vllm-workspace/vllm checkout FETCH_HEAD # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.a3 b/Dockerfile.a3 index cda1c8d8b3f..eabf42a0874 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -50,8 +50,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.19.1 -RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +# ARG VLLM_TAG=v0.19.1 +# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +RUN git init /vllm-workspace/vllm && \ + git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ + git -C /vllm-workspace/vllm checkout FETCH_HEAD # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 0f30484a589..270a42672ca 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -49,8 +49,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.19.1 -RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +# ARG VLLM_TAG=v0.19.1 +# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +RUN git init /vllm-workspace/vllm && \ + git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ + git -C /vllm-workspace/vllm checkout FETCH_HEAD # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 47d0d2a79c0..ec5cabbe308 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -49,8 +49,12 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.19.1 -RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +# ARG VLLM_TAG=v0.19.1 +# RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm +ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +RUN git init /vllm-workspace/vllm && \ + git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ + git -C /vllm-workspace/vllm checkout FETCH_HEAD # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip uninstall -y triton && \ diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 354ee1c1179..8f5bbb5b6be 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -62,7 +62,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | Triton Ascend | |-------------|--------------|------------------|-------------|--------------------|---------------| -| main | {{main_vllm_commit}}, {{main_vllm_tag}} | {{main_python_version}} | {{main_cann_version}} | {{main_pytorch_torch_npu_version}} | {{main_triton_ascend_version}} | +| main | {{main_vllm_commit}} | {{main_python_version}} | {{main_cann_version}} | {{main_pytorch_torch_npu_version}} | {{main_triton_ascend_version}} | ## Release cadence diff --git a/tests/e2e/singlecard/model_runner_v2/test_basic.py b/tests/e2e/singlecard/model_runner_v2/test_basic.py index 034a5350df8..3edf1b4efc7 100644 --- a/tests/e2e/singlecard/model_runner_v2/test_basic.py +++ b/tests/e2e/singlecard/model_runner_v2/test_basic.py @@ -22,7 +22,6 @@ from vllm import SamplingParams from tests.e2e.conftest import VllmRunner -from vllm_ascend.utils import vllm_version_is MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"] @@ -30,7 +29,6 @@ EGALE_MODELS = ["vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"] -@pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("enforce_eager", [True]) @@ -65,7 +63,6 @@ def test_qwen3_dense_eager_mode( runner.model.generate(prompts, sampling_params) -@pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1") @pytest.mark.parametrize("model", MAIN_MODELS) @pytest.mark.parametrize("eagle_model", EGALE_MODELS) @pytest.mark.parametrize("max_tokens", [32]) @@ -104,7 +101,6 @@ def test_egale_spec_decoding( runner.model.generate(prompts, sampling_params) -@pytest.mark.skipif(vllm_version_is("0.19.1"), reason="no need to support model_runner for v0.19.1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("enforce_eager", [False]) diff --git a/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py index 806b20fa26a..2203a8e054c 100644 --- a/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py +++ b/tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py @@ -625,11 +625,6 @@ def test_dflash_acceptance( method: str, num_speculative_tokens: int, ): - from vllm_ascend.utils import vllm_version_is - - if vllm_version_is("0.19.1"): - pytest.skip("Dflash tests are not supported on vLLM version 0.19.1") - main_model_name = DFLASH[method]["main"] spec_model_name = DFLASH[method]["spec"] diff --git a/tests/ut/ops/test_mla.py b/tests/ut/ops/test_mla.py index f3e553d8aa5..a79d582b411 100644 --- a/tests/ut/ops/test_mla.py +++ b/tests/ut/ops/test_mla.py @@ -8,7 +8,6 @@ from tests.ut.base import TestBase from vllm_ascend.ops.mla import AscendMultiHeadLatentAttention, IndexerWrapper -from vllm_ascend.utils import vllm_version_is class TestIndexerWrapper(TestBase): @@ -19,11 +18,7 @@ def test_initialization(self): mock_indexer.topk_tokens = 2048 mock_indexer.q_lora_rank = 1536 mock_indexer.wq_b = nn.Linear(128, 128) - if vllm_version_is("0.19.1"): - mock_indexer.wk = nn.Linear(128, 128) - mock_indexer.weights_proj = nn.Linear(128, 128) - else: - mock_indexer.wk_weights_proj = nn.Linear(128, 128) + mock_indexer.wk_weights_proj = nn.Linear(128, 128) mock_indexer.k_norm = nn.LayerNorm(128) mock_indexer.softmax_scale = 0.123 mock_indexer.topk_indices_buffer = torch.randn(10) @@ -36,11 +31,7 @@ def test_initialization(self): self.assertEqual(wrapper.topk_tokens, 2048) self.assertEqual(wrapper.q_lora_rank, 1536) self.assertIs(wrapper.wq_b, mock_indexer.wq_b) - if vllm_version_is("0.19.1"): - self.assertIs(wrapper.wk, mock_indexer.wk) - self.assertIs(wrapper.weights_proj, mock_indexer.weights_proj) - else: - self.assertIs(wrapper.wk_weights_proj, mock_indexer.wk_weights_proj) + self.assertIs(wrapper.wk_weights_proj, mock_indexer.wk_weights_proj) self.assertIs(wrapper.k_norm, mock_indexer.k_norm) self.assertEqual(wrapper.softmax_scale, 0.123) diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py index 224647beb29..7c81d6a7336 100644 --- a/vllm_ascend/_310p/fused_moe/fused_moe.py +++ b/vllm_ascend/_310p/fused_moe/fused_moe.py @@ -27,7 +27,6 @@ from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input from vllm_ascend.quantization.quant_type import QuantType -from vllm_ascend.utils import vllm_version_is from .experts_selector import select_experts from .moe_comm_method import AllGatherCommImpl310 @@ -164,14 +163,13 @@ def __init__(self, *args, **kwargs): from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner - is_legacy = vllm_version_is("0.19.1") self.runner = AscendMoERunner( - self if is_legacy else self.layer_name, + self.layer_name, self.moe_config, self.router, self._routed_input_transform, - self.gate if is_legacy else kwargs.pop("gate", None), - self.shared_experts if is_legacy else kwargs.pop("shared_experts", None), + kwargs.pop("gate", None), + kwargs.pop("shared_experts", None), self.quant_method, self.reduce_results, self.vllm_config.parallel_config.enable_dbo, @@ -285,9 +283,8 @@ def __init__( # which at this point is still the stale runner built with shared_experts=None. from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner - is_legacy = vllm_version_is("0.19.1") self.runner = AscendMoERunner( - self if is_legacy else self.layer_name, + self.layer_name, self.moe_config, self.router, self._routed_input_transform, diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index ceea3511bfe..0b446a87a9c 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -20,7 +20,6 @@ is_drafter_moe_model, is_moe_model, speculative_enable_dispatch_gmm_combine_decode, - vllm_version_is, ) @@ -156,10 +155,7 @@ def set_ascend_forward_context( dp_world_size = get_dp_group().world_size if dp_world_size > 1 and forward_context.dp_metadata is not None: dp_meta = forward_context.dp_metadata - if vllm_version_is("0.19.1"): - max_tokens_across_dp = dp_meta.max_tokens_across_dp_cpu.item() - else: - max_tokens_across_dp = dp_meta.num_tokens_across_dp_cpu.max().item() + max_tokens_across_dp = dp_meta.num_tokens_across_dp_cpu.max().item() if forward_context.flash_comm_v1_enabled or forward_context.flashcomm_v2_enabled: padded_length = (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size pad_size = padded_length - num_tokens diff --git a/vllm_ascend/attention/context_parallel/sfa_cp.py b/vllm_ascend/attention/context_parallel/sfa_cp.py index 840bbae7893..83568a07acd 100644 --- a/vllm_ascend/attention/context_parallel/sfa_cp.py +++ b/vllm_ascend/attention/context_parallel/sfa_cp.py @@ -12,7 +12,6 @@ from vllm_ascend.attention.sfa_v1 import AscendSFAImpl, AscendSFAMetadata, AscendSFAMetadataBuilder from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, enabling_mlapo, split_decodes_and_prefills from vllm_ascend.ops.triton.rope import rope_forward_triton_siso -from vllm_ascend.utils import vllm_version_is M = TypeVar("M", bound=AscendSFAMetadata) @@ -414,12 +413,8 @@ def indexer_select_post_process( actual_seq_lengths_query: torch.Tensor, actual_seq_lengths_key: torch.Tensor, ): - if vllm_version_is("0.19.1"): - weights, _ = self.weights_proj(x) - else: - kw, _ = self.wk_weights_proj(x) - weights = kw[:, self.head_dim :] - + kw, _ = self.wk_weights_proj(x) + weights = kw[:, self.head_dim :] q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128] q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128] if HAS_TRITON: diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py index 7b1da4fb18b..5506ae377b9 100644 --- a/vllm_ascend/attention/sfa_v1.py +++ b/vllm_ascend/attention/sfa_v1.py @@ -55,7 +55,6 @@ enable_dsa_cp_with_o_proj_tp, get_weight_prefetch_method, maybe_trans_nz, - vllm_version_is, ) from vllm_ascend.worker.npu_input_batch import NPUInputBatch @@ -439,11 +438,7 @@ def __init__( self.n_head: int = self.indexer.n_head # 64 self.head_dim: int = self.indexer.head_dim # 128 self.wq_b = self.indexer.wq_b - if vllm_version_is("0.19.1"): - self.wk = self.indexer.wk - self.weights_proj = self.indexer.weights_proj - else: - self.wk_weights_proj = self.indexer.wk_weights_proj + self.wk_weights_proj = self.indexer.wk_weights_proj self.k_norm = self.indexer.k_norm self.cp_size = 1 self.is_rope_neox_style = True @@ -912,11 +907,8 @@ def indexer_select_pre_process( cos: torch.Tensor, sin: torch.Tensor, ): - if vllm_version_is("0.19.1"): - k_li, _ = self.wk(x) # [b,s,7168] @ [7168,128] = [b,s,128] - else: - kw, _ = self.wk_weights_proj(x) - k_li = kw[:, : self.head_dim] + kw, _ = self.wk_weights_proj(x) + k_li = kw[:, : self.head_dim] k_li = self.k_norm(k_li).unsqueeze(1) k_li = k_li.view(-1, 1, self.head_dim) @@ -961,12 +953,8 @@ def indexer_select_post_process( actual_seq_lengths_query: torch.Tensor, actual_seq_lengths_key: torch.Tensor, ): - if vllm_version_is("0.19.1"): - weights, _ = self.weights_proj(x) - else: - kw, _ = self.wk_weights_proj(x) - weights = kw[:, self.head_dim :] - + kw, _ = self.wk_weights_proj(x) + weights = kw[:, self.head_dim :] q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128] q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128] if HAS_TRITON: diff --git a/vllm_ascend/core/recompute_scheduler.py b/vllm_ascend/core/recompute_scheduler.py index ae6d1e669af..8cb23e82b4e 100644 --- a/vllm_ascend/core/recompute_scheduler.py +++ b/vllm_ascend/core/recompute_scheduler.py @@ -46,8 +46,6 @@ from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.utils import ConstantList, record_function_or_nullcontext -from vllm_ascend.utils import vllm_version_is - # `spec_manager_map` in single_type_kv_cache_manager is a module-level dict # whose keys are class objects bound at import time. When the async @@ -209,11 +207,6 @@ def _update_waiting_for_remote_kv(self, request: Request) -> None: # Update the request state for scheduling. request.num_computed_tokens = num_computed_tokens - if vllm_version_is("0.19.1"): - # Count the number of prefix cached tokens. - if request.num_cached_tokens < 0: - request.num_cached_tokens = request.num_computed_tokens - self.finished_recving_kv_req_ids.remove(request.request_id) def schedule(self) -> RecomputeSchedulerOutput: @@ -500,11 +493,7 @@ def schedule(self) -> RecomputeSchedulerOutput: request_queue.pop_request() step_skipped_waiting.prepend_request(request) continue - - if vllm_version_is("0.19.1"): - request.num_external_computed_tokens = ext_tokens num_external_computed_tokens = ext_tokens - connector_prefix_cache_queries = request.num_tokens - num_new_local_computed_tokens connector_prefix_cache_hits = num_external_computed_tokens @@ -512,7 +501,7 @@ def schedule(self) -> RecomputeSchedulerOutput: num_computed_tokens = num_new_local_computed_tokens + num_external_computed_tokens assert num_computed_tokens <= request.num_tokens - if not vllm_version_is("0.19.1") and request.prefill_stats is not None: + if request.prefill_stats is not None: request.prefill_stats.set( num_prompt_tokens=request.num_prompt_tokens, num_local_cached_tokens=num_new_local_computed_tokens, @@ -691,10 +680,6 @@ def schedule(self) -> RecomputeSchedulerOutput: token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - if vllm_version_is("0.19.1"): - # Count the number of prefix cached tokens. - if request.num_cached_tokens < 0: - request.num_cached_tokens = num_computed_tokens # Encoder-related. if encoder_inputs_to_schedule: scheduled_encoder_inputs[request_id] = encoder_inputs_to_schedule @@ -956,11 +941,7 @@ def update_from_output( if new_token_ids or pooler_output is not None or kv_transfer_params or stopped: # Add EngineCoreOutput for this Request. prefill_kwargs: dict = {} - if not vllm_version_is("0.19.1"): - prefill_kwargs["prefill_stats"] = request.take_prefill_stats() - else: - prefill_kwargs["num_cached_tokens"] = request.num_cached_tokens - prefill_kwargs["num_external_computed_tokens"] = request.num_external_computed_tokens + prefill_kwargs["prefill_stats"] = request.take_prefill_stats() outputs[request.client_index].append( EngineCoreOutput( request_id=req_id, @@ -994,8 +975,6 @@ def update_from_output( self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR) for request in requests: prefill_kwargs = {} - if vllm_version_is("0.19.1"): - prefill_kwargs["num_cached_tokens"] = request.num_cached_tokens outputs[request.client_index].append( EngineCoreOutput( request_id=request.request_id, diff --git a/vllm_ascend/core/scheduler_dynamic_batch.py b/vllm_ascend/core/scheduler_dynamic_batch.py index a055b2fde6d..1a9260d947d 100644 --- a/vllm_ascend/core/scheduler_dynamic_batch.py +++ b/vllm_ascend/core/scheduler_dynamic_batch.py @@ -31,8 +31,6 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - class BudgetRefiner: """This budget refiner can make dynamic adjustment to the token budget @@ -491,10 +489,6 @@ def schedule(self) -> SchedulerOutput: token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - if vllm_version_is("0.19.1"): - # Count the number of prefix cached tokens. - if request.num_cached_tokens < 0: - request.num_cached_tokens = num_computed_tokens # Encoder-related. if encoder_inputs_to_schedule: scheduled_encoder_inputs[request.request_id] = encoder_inputs_to_schedule diff --git a/vllm_ascend/core/scheduler_profiling_chunk.py b/vllm_ascend/core/scheduler_profiling_chunk.py index 02b891f74a4..df9766e20ca 100644 --- a/vllm_ascend/core/scheduler_profiling_chunk.py +++ b/vllm_ascend/core/scheduler_profiling_chunk.py @@ -41,7 +41,6 @@ from vllm.v1.utils import record_function_or_nullcontext from vllm_ascend.core.profiling_chunk_predictor import ProfilingChunkManager -from vllm_ascend.utils import vllm_version_is class ProfilingChunkScheduler(Scheduler): @@ -482,9 +481,6 @@ def schedule(self) -> SchedulerOutput: # noqa: C901 request_queue.pop_request() step_skipped_waiting.prepend_request(request) continue - - if vllm_version_is("0.19.1"): - request.num_external_computed_tokens = ext_tokens num_external_computed_tokens = ext_tokens connector_prefix_cache_queries = request.num_tokens - num_new_local_computed_tokens @@ -492,7 +488,7 @@ def schedule(self) -> SchedulerOutput: # noqa: C901 num_computed_tokens = num_new_local_computed_tokens + num_external_computed_tokens - if not vllm_version_is("0.19.1") and request.prefill_stats is not None: + if request.prefill_stats is not None: request.prefill_stats.set( num_prompt_tokens=request.num_prompt_tokens, num_local_cached_tokens=num_new_local_computed_tokens, @@ -637,9 +633,6 @@ def schedule(self) -> SchedulerOutput: # noqa: C901 time_budget -= self.profiling_chunk_manager.predict_time(num_new_tokens, request.num_computed_tokens) request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - if vllm_version_is("0.19.1"): - if request.num_cached_tokens < 0: - request.num_cached_tokens = num_computed_tokens if encoder_inputs_to_schedule: scheduled_encoder_inputs[request_id] = encoder_inputs_to_schedule for i in encoder_inputs_to_schedule: diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index b0d7a946b33..8a4173fe9cf 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -49,7 +49,6 @@ npu_stream_switch, shared_expert_dp_enabled, shared_experts_calculation_stream, - vllm_version_is, ) @@ -357,14 +356,13 @@ def __init__(self, *args, **kwargs): setup_moe_comm_method(self.moe_config) self.quant_type = self._get_quant_type() - is_legacy = vllm_version_is("0.19.1") self.runner = AscendMoERunner( - self if is_legacy else self.layer_name, + self.layer_name, self.moe_config, self.router, self._routed_input_transform, - self.gate if is_legacy else kwargs.pop("gate", None), - self.shared_experts if is_legacy else kwargs.pop("shared_experts", None), + kwargs.pop("gate", None), + kwargs.pop("shared_experts", None), self.quant_method, self.reduce_results, self.vllm_config.parallel_config.enable_dbo, @@ -583,9 +581,8 @@ def __init__( # NOTE: must use self._shared_experts here, not self.shared_experts — # FusedMoE.shared_experts is a property that reads self.runner.shared_experts, # which at this point is still the stale runner built with shared_experts=None. - is_legacy = vllm_version_is("0.19.1") self.runner = AscendMoERunner( - self if is_legacy else self.layer_name, + self.layer_name, self.moe_config, self.router, self._routed_input_transform, diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index 85a08dc194a..047b27eea9e 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -33,7 +33,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import _EXTRA_CTX -from vllm_ascend.utils import is_vl_model, parse_layer_idx, vllm_version_is +from vllm_ascend.utils import is_vl_model, parse_layer_idx class IndexerWrapper(nn.Module): @@ -54,12 +54,7 @@ def __init__(self, vllm_indexer: nn.Module) -> None: self.topk_tokens: int = vllm_indexer.topk_tokens # 2048 self.q_lora_rank: int = vllm_indexer.q_lora_rank # 1536 self.wq_b = vllm_indexer.wq_b - # upstream ac3dac545 fused wk+weights_proj into wk_weights_proj - if vllm_version_is("0.19.1"): - self.wk = vllm_indexer.wk - self.weights_proj = vllm_indexer.weights_proj - else: - self.wk_weights_proj = vllm_indexer.wk_weights_proj + self.wk_weights_proj = vllm_indexer.wk_weights_proj self.k_norm = vllm_indexer.k_norm self.softmax_scale = vllm_indexer.softmax_scale vllm_indexer.topk_indices_buffer = None # delete topk_indices_buffer diff --git a/vllm_ascend/patch/platform/patch_balance_schedule.py b/vllm_ascend/patch/platform/patch_balance_schedule.py index a590231abb8..5711352aff3 100644 --- a/vllm_ascend/patch/platform/patch_balance_schedule.py +++ b/vllm_ascend/patch/platform/patch_balance_schedule.py @@ -24,8 +24,6 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.utils import record_function_or_nullcontext -from vllm_ascend.utils import vllm_version_is - class BalanceScheduler(Scheduler): def __init__( @@ -351,17 +349,14 @@ def schedule(self) -> SchedulerOutput: skipped_waiting_requests.prepend_request(request) continue - if vllm_version_is("0.19.1"): - request.num_external_computed_tokens = ext_tokens num_external_computed_tokens = ext_tokens - connector_prefix_cache_queries = request.num_tokens - num_new_local_computed_tokens connector_prefix_cache_hits = num_external_computed_tokens # Total computed tokens (local + external). num_computed_tokens = num_new_local_computed_tokens + num_external_computed_tokens - if not vllm_version_is("0.19.1") and request.prefill_stats is not None: + if request.prefill_stats is not None: request.prefill_stats.set( num_prompt_tokens=request.num_prompt_tokens, num_local_cached_tokens=num_new_local_computed_tokens, @@ -506,10 +501,6 @@ def schedule(self) -> SchedulerOutput: token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - if vllm_version_is("0.19.1"): - # Count the number of prefix cached tokens. - if request.num_cached_tokens < 0: - request.num_cached_tokens = num_computed_tokens # Encoder-related. if encoder_inputs_to_schedule: scheduled_encoder_inputs[request_id] = encoder_inputs_to_schedule diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index 5f27adcdc82..ff0c51d15df 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -17,7 +17,7 @@ from vllm.triton_utils import HAS_TRITON -from vllm_ascend.utils import is_310p, vllm_version_is +from vllm_ascend.utils import is_310p if HAS_TRITON: import vllm_ascend.patch.worker.patch_triton @@ -38,9 +38,8 @@ if not is_310p(): import vllm_ascend.patch.worker.patch_qwen3_5 # noqa import vllm_ascend.patch.worker.patch_gdn_attn # noqa + import vllm_ascend.patch.worker.patch_qwen3_dflash # noqa - if not vllm_version_is("0.19.1"): - import vllm_ascend.patch.worker.patch_qwen3_dflash # noqa import vllm_ascend.patch.worker.patch_rejection_sampler # noqa import vllm_ascend.patch.worker.patch_v2.patch_uva # noqa import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa diff --git a/vllm_ascend/patch/worker/patch_qwen3vl.py b/vllm_ascend/patch/worker/patch_qwen3vl.py index 5ef8e97abc8..b5b77d8d1e5 100644 --- a/vllm_ascend/patch/worker/patch_qwen3vl.py +++ b/vllm_ascend/patch/worker/patch_qwen3vl.py @@ -5,11 +5,11 @@ from vllm.model_executor.models.qwen3_vl import ( Qwen3_VisionTransformer, Qwen3VLForConditionalGeneration, + pos_embed_interpolate_native, ) from vllm_ascend.ascend_forward_context import _EXTRA_CTX from vllm_ascend.ops.rotary_embedding import AscendMRotaryEmbedding -from vllm_ascend.utils import vllm_version_is def tensor_parallel_wrap(func): @@ -73,24 +73,22 @@ def forward_with_split_qkv_rmsnorm_mrope(self, positions: torch.Tensor, hidden_s Qwen3VLForConditionalGeneration._get_deepstack_input_embeds ) -if not vllm_version_is("0.19.1"): - # Only patch for latest main - from vllm.model_executor.models.qwen3_vl import pos_embed_interpolate_native - def _fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: - outputs = [] - for t, h, w in grid_thw: - outputs.append( - pos_embed_interpolate_native( - self.pos_embed.weight, - t, - h, - w, - self.num_grid_per_side, - self.spatial_merge_size, - self.dtype, - ) +def _fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: + outputs = [] + for t, h, w in grid_thw: + outputs.append( + pos_embed_interpolate_native( + self.pos_embed.weight, + t, + h, + w, + self.num_grid_per_side, + self.spatial_merge_size, + self.dtype, ) - return torch.cat(outputs, dim=0) + ) + return torch.cat(outputs, dim=0) + - Qwen3_VisionTransformer.fast_pos_embed_interpolate = _fast_pos_embed_interpolate +Qwen3_VisionTransformer.fast_pos_embed_interpolate = _fast_pos_embed_interpolate diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 0ba64ee424c..08c33b9d530 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -48,7 +48,6 @@ update_cudagraph_capture_sizes, is_310p, enable_sp, - vllm_version_is, ) if TYPE_CHECKING: @@ -757,10 +756,7 @@ def set_additional_forward_context( num_tokens = list(attn_metadata.values())[0].num_actual_tokens dp_world_size = get_dp_group().world_size if dp_world_size > 1 and dp_metadata is not None: - if vllm_version_is("0.19.1"): - max_tokens_across_dp = dp_metadata.max_tokens_across_dp_cpu.item() - else: - max_tokens_across_dp = dp_metadata.num_tokens_across_dp_cpu.max().item() + max_tokens_across_dp = dp_metadata.num_tokens_across_dp_cpu.max().item() if flash_comm_v1_enabled or flashcomm_v2_enabled: padded_length = (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size pad_size = padded_length - num_tokens diff --git a/vllm_ascend/spec_decode/__init__.py b/vllm_ascend/spec_decode/__init__.py index 47929115690..9083e39587a 100644 --- a/vllm_ascend/spec_decode/__init__.py +++ b/vllm_ascend/spec_decode/__init__.py @@ -24,7 +24,6 @@ from vllm_ascend.spec_decode.medusa_proposer import AscendMedusaProposer from vllm_ascend.spec_decode.ngram_proposer import AscendNgramProposer from vllm_ascend.spec_decode.suffix_proposer import AscendSuffixDecodingProposer -from vllm_ascend.utils import vllm_version_is def get_spec_decode_method(method, vllm_config, device, runner): @@ -37,10 +36,7 @@ def get_spec_decode_method(method, vllm_config, device, runner): elif method in ("eagle", "eagle3", "mtp"): return AscendEagleProposer(vllm_config, device, runner) elif method == "dflash": - if not vllm_version_is("0.19.1"): - return AscendDflashProposer(vllm_config, device, runner) - else: - raise ValueError(f"VLLM v0.19.1 doesn't support {method} now") + return AscendDflashProposer(vllm_config, device, runner) elif method == "draft_model": return AscendDraftModelProposer(vllm_config, device, runner) else: diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 4a7385a537f..faff270b7a9 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -25,6 +25,7 @@ from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM +from vllm.model_executor.models.qwen3_dflash import DFlashQwen3ForCausalLM from vllm.triton_utils import HAS_TRITON, triton from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import is_pin_memory_available @@ -47,12 +48,7 @@ from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num -from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is - -if not vllm_version_is("0.19.1"): - from vllm.model_executor.models.qwen3_dflash import DFlashQwen3ForCausalLM -else: - DFlashQwen3ForCausalLM = None +from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled # Currently we will fix block size to a small one since `num_reqs` can't be too large _PREPARE_INPUTS_BLOCK_SIZE = 4 diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index b1855f165e4..4a2f84ad798 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -44,7 +44,10 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput from vllm.v1.worker.gpu_worker import AsyncIntermediateTensors -from vllm.v1.worker.worker_base import WorkerBase +from vllm.v1.worker.worker_base import ( + CompilationTimes, # noqa: E402 + WorkerBase, +) from vllm.v1.worker.workspace import init_workspace_manager import vllm_ascend.envs as envs_ascend @@ -60,13 +63,9 @@ enable_sp, get_ascend_device_type, register_ascend_customop, - vllm_version_is, ) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner -if not vllm_version_is("0.19.1"): - from vllm.v1.worker.worker_base import CompilationTimes # noqa: E402 - torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402 from torch._dynamo.variables import TorchInGraphFunctionVariable # noqa: E402 from vllm.utils.torch_utils import set_random_seed # noqa: E402 @@ -554,8 +553,6 @@ def compile_or_warm_up_model(self): # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) - if vllm_version_is("0.19.1"): - return self.vllm_config.compilation_config.compilation_time return CompilationTimes( language_model=self.vllm_config.compilation_config.compilation_time,