diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 9ef5650e24b..ef7c621acf2 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -27,7 +27,7 @@ on: continue_on_error: required: false type: boolean - default: false + default: true # The following inputs are used by comment-triggered E2E tests (/e2e ). # They carry space-separated pytest paths, categorized by runner type. # Leave empty (default) when running label-triggered full/light suites. diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index af664c1dc32..53d7515f857 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +ARG VLLM_COMMIT=v0.20.1 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 879bc4efe44..04bcdcfaf47 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -80,7 +80,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml @@ -102,7 +102,7 @@ jobs: strategy: fail-fast: false matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm_version: [v0.20.1] needs: [parse-trigger] if: ${{ needs.parse-trigger.outputs.allowed == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 2f6678c126c..587ed100457 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: d886c26d4d4fef7d079696beb4ece1cfb4b008a8 + vllm: c7aa186d67b6f051680831418e957c67f34ba7a2 changes: runs-on: linux-aarch64-a2b3-0 container: @@ -154,7 +154,7 @@ jobs: if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }} strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1] uses: ./.github/workflows/_optional_smart_e2e.yaml with: vllm: ${{ matrix.vllm_version }} @@ -164,7 +164,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_update_estimated_time.yaml b/.github/workflows/schedule_update_estimated_time.yaml index b8a18b4c71b..904472e03bb 100644 --- a/.github/workflows/schedule_update_estimated_time.yaml +++ b/.github/workflows/schedule_update_estimated_time.yaml @@ -23,7 +23,7 @@ jobs: name: e2e-test strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm_version: [v0.20.1] type: [full, light] uses: ./.github/workflows/_e2e_test.yaml with: diff --git a/.github/workflows/schedule_vllm_e2e_test.yaml b/.github/workflows/schedule_vllm_e2e_test.yaml index 2297498267c..1ba8468c990 100644 --- a/.github/workflows/schedule_vllm_e2e_test.yaml +++ b/.github/workflows/schedule_vllm_e2e_test.yaml @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: part: [0, 1, 2, 3] - vllm: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm: [v0.20.1] container: image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11 env: diff --git a/.github/workflows/scripts/config.yaml b/.github/workflows/scripts/config.yaml index 29da9a4501c..93137fe7b80 100644 --- a/.github/workflows/scripts/config.yaml +++ b/.github/workflows/scripts/config.yaml @@ -31,8 +31,11 @@ e2e-singlecard: estimated_time: 222 - name: tests/e2e/singlecard/test_qwen3_multi_loras.py estimated_time: 100 -- name: tests/e2e/singlecard/test_models.py - estimated_time: 315 +- name: tests/e2e/singlecard/test_models.py::test_minicpm + estimated_time: 158 +- name: tests/e2e/singlecard/test_models.py::test_whisper + estimated_time: 157 + is_skipped: true - name: tests/e2e/singlecard/test_multistream_overlap_shared_expert.py estimated_time: 253 - name: tests/e2e/singlecard/test_quantization.py @@ -112,6 +115,7 @@ e2e-multicard-2-cards: estimated_time: 178 - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2 estimated_time: 127 + is_skipped: true - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_moe_fc2_tp2 estimated_time: 149 - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2 @@ -130,8 +134,17 @@ e2e-multicard-2-cards: estimated_time: 400 - name: tests/e2e/multicard/2-cards/test_quantization.py estimated_time: 482 -- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py - estimated_time: 974 +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep + estimated_time: 195 +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_w8a8_distributed_tp2 + estimated_time: 195 +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_aiv_tp2 + estimated_time: 195 +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_tp2_ep2_mrv2 + estimated_time: 195 + is_skipped: true +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb + estimated_time: 194 - name: tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py estimated_time: 193 - name: tests/e2e/multicard/2-cards/test_single_request_aclgraph.py @@ -151,12 +164,35 @@ e2e-multicard-4-cards: estimated_time: 322 - name: tests/e2e/multicard/4-cards/test_kimi_k2.py estimated_time: 37 -- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py - estimated_time: 1287 +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_models_long_sequence_output_between_tp_and_cp + estimated_time: 257 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_dcp_only_graph + estimated_time: 257 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_dcp_only_eager + estimated_time: 257 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_pcp_only + estimated_time: 257 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_models_long_sequence_cp_kv_interleave_size_output_between_tp_and_cp + estimated_time: 259 - name: tests/e2e/multicard/4-cards/long_sequence/test_basic.py estimated_time: 2179 -- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py - estimated_time: 1173 +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_mixed_length_prompts_including_1_token + estimated_time: 235 +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_empty_kvcache + estimated_time: 235 +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_basic + estimated_time: 235 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_piecewise + estimated_time: 235 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_full_graph + estimated_time: 233 + is_skipped: true - name: tests/e2e/multicard/4-cards/long_sequence/test_prefix_caching_cp.py estimated_time: 850 - name: tests/e2e/multicard/4-cards/long_sequence/test_mtp.py diff --git a/Dockerfile b/Dockerfile index 221cafb89ba..d1f3eb3f5e9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # ARG VLLM_TAG=v0.19.1 # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm -ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +ARG VLLM_COMMIT=v0.20.1 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/Dockerfile.310p b/Dockerfile.310p index 79000a1eb53..8147b416e89 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -35,7 +35,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # ARG VLLM_TAG=v0.19.1 # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm -ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +ARG VLLM_COMMIT=v0.20.1 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index 27ec4290229..7c87bb67740 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -34,7 +34,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # ARG VLLM_TAG=v0.19.1 # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm -ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +ARG VLLM_COMMIT=v0.20.1 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/Dockerfile.a3 b/Dockerfile.a3 index eabf42a0874..a7f141235fb 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -52,7 +52,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # ARG VLLM_TAG=v0.19.1 # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm -ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +ARG VLLM_COMMIT=v0.20.1 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 270a42672ca..b0d0dc80d64 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -51,7 +51,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # ARG VLLM_TAG=v0.19.1 # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm -ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +ARG VLLM_COMMIT=v0.20.1 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index ec5cabbe308..3e1c5db0316 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -51,7 +51,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # ARG VLLM_TAG=v0.19.1 # RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm -ARG VLLM_COMMIT=d886c26d4d4fef7d079696beb4ece1cfb4b008a8 +ARG VLLM_COMMIT=v0.20.1 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/docs/source/conf.py b/docs/source/conf.py index fa37102cdeb..6bdd0ddaf0b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -81,9 +81,9 @@ # CANN image tag "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11", # vLLM commit hash for main branch - "main_vllm_commit": "d886c26d4d4fef7d079696beb4ece1cfb4b008a8", + "main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2", # vLLM tag for main branch - "main_vllm_tag": "v0.19.1", + "main_vllm_tag": "v0.20.1", # Python version for main branch "main_python_version": ">= 3.10, < 3.12", # CANN version for main branch diff --git a/tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py b/tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py index 545dfe0d65d..2a97cf622c5 100644 --- a/tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py +++ b/tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py @@ -20,7 +20,6 @@ from vllm_ascend._310p.fused_moe.fused_moe import ( AscendFusedMoE310, - AscendSharedFusedMoE310, ) @@ -48,8 +47,8 @@ def forward(self, hidden_states: torch.Tensor): return out -def _build_layer(shared_experts: torch.nn.Module | None) -> AscendSharedFusedMoE310: - layer = AscendSharedFusedMoE310.__new__(AscendSharedFusedMoE310) +def _build_layer(shared_experts: torch.nn.Module | None) -> AscendFusedMoE310: + layer = AscendFusedMoE310.__new__(AscendFusedMoE310) # The test bypasses full layer init with __new__, so we must initialize # nn.Module internals before assigning child modules. torch.nn.Module.__init__(layer) @@ -80,7 +79,7 @@ def test_forward_impl_with_shared_experts_returns_tuple_310(): routed_out = torch.randn(3, 8) with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out): - shared_out, routed = layer.forward_impl(hidden_states, router_logits) + shared_out, routed = layer.shared_forward_impl(hidden_states, router_logits) expected_shared = 0.5 * (hidden_states * 2.0 + 1.0) torch.testing.assert_close(shared_out, expected_shared) @@ -100,7 +99,7 @@ def test_forward_impl_without_shared_experts_returns_routed_only_310(): routed_out = torch.randn(3, 8) with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out): - output = layer.forward_impl(hidden_states, router_logits) + output = layer.shared_forward_impl(hidden_states, router_logits) torch.testing.assert_close(output, routed_out) diff --git a/tests/ut/ops/test_fused_moe.py b/tests/ut/ops/test_fused_moe.py index f21d4dc82f8..fcb87262da0 100644 --- a/tests/ut/ops/test_fused_moe.py +++ b/tests/ut/ops/test_fused_moe.py @@ -236,6 +236,12 @@ def moe_method(mock_dist_env): return AscendUnquantizedFusedMoEMethod(moe) +def test_ascend_unquantized_skips_upstream_modular_kernel_init(): + method = AscendUnquantizedFusedMoEMethod.maybe_make_prepare_finalize + + assert method(object()) is None + + class Device(TypedDict): device_id: int device_expert: list[int] diff --git a/tests/ut/spec_decode/test_eagle_proposer.py b/tests/ut/spec_decode/test_eagle_proposer.py index 6a808e9154b..64bdb5a0830 100644 --- a/tests/ut/spec_decode/test_eagle_proposer.py +++ b/tests/ut/spec_decode/test_eagle_proposer.py @@ -17,6 +17,17 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.spec_decode.draft_proposer import AscendDraftModelProposer from vllm_ascend.spec_decode.eagle_proposer import AscendEagleProposer +from vllm_ascend.utils import vllm_version_is + +# vLLM #40732 moved `SpecDecodeBaseProposer` (and its `CpuGpuBuffer` import) +# out of `vllm.v1.spec_decode.eagle` into `vllm.v1.spec_decode.llm_base_proposer`. +# Pick the right patch path depending on the installed vllm version so the +# tests can mock the buffer factory. +_CPU_GPU_BUFFER_TARGET = ( + "vllm.v1.spec_decode.eagle.CpuGpuBuffer" + if vllm_version_is("0.19.1") + else "vllm.v1.spec_decode.llm_base_proposer.CpuGpuBuffer" +) class TestEagleProposerInitialization(TestBase): @@ -51,13 +62,15 @@ def setUp(self): self.vllm_config.parallel_config.enable_expert_parallel = False self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.num_speculative_tokens = 2 + self.vllm_config.speculative_config.parallel_drafting = False self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(2)]) + self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[]) self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0 self.vllm_config.speculative_config.draft_model_config.uses_mrope = False self.vllm_config.speculative_config.disable_padded_drafter_batch = False self.vllm_config.additional_config = None - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -76,6 +89,7 @@ def tearDown(self): def test_initialization_eagle_graph(self): self.vllm_config.speculative_config.method = "eagle" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096 + self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4096 self.vllm_config.speculative_config.draft_model_config.uses_mrope = False self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE self.vllm_config.model_config.enforce_eager = False @@ -99,6 +113,7 @@ def test_initialization_eagle_graph(self): def test_initialization_eagle3_enforce_eager(self): self.vllm_config.speculative_config.method = "eagle3" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048 + self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048 self.vllm_config.compilation_config.mode = CompilationMode.NONE self.vllm_config.compilation_config.pass_config = MagicMock() self.vllm_config.compilation_config.pass_config.enable_sp = False @@ -116,6 +131,7 @@ def test_initialization_eagle3_enforce_eager(self): def test_initialization_eagle3_full_graph_async(self): self.vllm_config.speculative_config.method = "eagle3" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048 + self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048 self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE self.vllm_config.model_config.enforce_eager = False self.vllm_config.speculative_config.enforce_eager = False @@ -133,6 +149,7 @@ def test_initialization_eagle3_full_graph_async(self): def test_initialization_mtp_full_graph_async(self): self.vllm_config.speculative_config.method = "mtp" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048 + self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048 self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE self.vllm_config.model_config.enforce_eager = False self.vllm_config.speculative_config.enforce_eager = False @@ -196,7 +213,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -332,7 +349,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -483,7 +500,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -558,7 +575,7 @@ def setUp_and_tearDown(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -1263,7 +1280,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer) + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -1747,6 +1764,7 @@ def setUp(self): self.vllm_config.speculative_config.use_local_argmax_reduction = False self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(3)]) + self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[]) self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4 self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4 self.vllm_config.speculative_config.draft_model_config.uses_mrope = False @@ -1755,7 +1773,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer) + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -1876,7 +1894,14 @@ def check_mock(self): import vllm.v1.spec_decode.eagle - assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer") + # `CpuGpuBuffer` was re-exported from `eagle` until vLLM #40732 moved + # `SpecDecodeBaseProposer` (and the import) into `llm_base_proposer`. + if vllm_version_is("0.19.1"): + assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer") + else: + import vllm.v1.spec_decode.llm_base_proposer + + assert hasattr(vllm.v1.spec_decode.llm_base_proposer, "CpuGpuBuffer") RunnerCls = vllm.v1.spec_decode.eagle.SpecDecodeBaseProposer for attr in ("_get_positions", "_set_positions"): assert hasattr(RunnerCls, attr), f"SpecDecodeBaseProposer.{attr} not found" diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py index 7c81d6a7336..5b06e9138bb 100644 --- a/vllm_ascend/_310p/fused_moe/fused_moe.py +++ b/vllm_ascend/_310p/fused_moe/fused_moe.py @@ -20,7 +20,6 @@ from vllm.distributed import get_dp_group, get_ep_group, get_tp_group from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute @@ -40,6 +39,11 @@ def __init__(self, moe: FusedMoEConfig = None): def is_monolithic(self) -> bool: return False + def maybe_make_prepare_finalize(self, routing_tables=None): + # Ascend 310P uses its own MoE communication and forward_impl path. + # Do not let upstream modular-kernel initialization replace it. + return None + def process_weights_after_loading(self, layer): super().process_weights_after_loading(layer) @@ -119,6 +123,8 @@ class AscendFusedMoE310(FusedMoE): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self._routed_input_transform = kwargs.get("routed_input_transform") + self._shared_experts = kwargs.get("shared_experts") self.global_num_experts = kwargs["num_experts"] if self.quant_config is None: @@ -127,6 +133,10 @@ def __init__(self, *args, **kwargs): self.quant_method = self.quant_config.get_quant_method(self, self.layer_name) assert self.quant_method is not None + # Keep base_quant_method aligned with the Ascend-replaced quant_method + # so FusedMoE.maybe_init_modular_kernel doesn't dispatch into the + # upstream UnquantizedFusedMoEMethod.maybe_make_prepare_finalize. + self.base_quant_method = self.quant_method self.moe_config.tp_group = get_tp_group() self.moe_config.dp_group = get_dp_group() @@ -175,6 +185,11 @@ def __init__(self, *args, **kwargs): self.vllm_config.parallel_config.enable_dbo, ) + @property + def is_internal_router(self) -> bool: + # 310P Ascend path expects router logits from the model forward path. + return False + def init_experts_map(self, moe_config): """ Initialize expert mapping for MoE (Mixture of Experts) model. @@ -260,68 +275,12 @@ def forward_impl( # type: ignore[override] return routed_out - -class AscendSharedFusedMoE310(SharedFusedMoE, AscendFusedMoE310): - def __init__( - self, - shared_experts: torch.nn.Module, - gate: torch.nn.Module | None = None, - use_overlapped: bool = True, - routed_input_transform: torch.nn.Module | None = None, - **kwargs, - ): - AscendFusedMoE310.__init__(self, **kwargs) - self._routed_input_transform = routed_input_transform - self._shared_experts = shared_experts - self.use_overlapped = use_overlapped - self.shared_expert_stream = None - self._gate = gate - # Recreate runner after shared_experts/gate are set so custom op dispatch - # goes through moe_forward_shared. - # NOTE: must use self._shared_experts here, not self.shared_experts — - # FusedMoE.shared_experts is a property that reads self.runner.shared_experts, - # which at this point is still the stale runner built with shared_experts=None. - from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner - - self.runner = AscendMoERunner( - self.layer_name, - self.moe_config, - self.router, - self._routed_input_transform, - self._gate, - self._shared_experts, - self.quant_method, - self.reduce_results, - self.vllm_config.parallel_config.enable_dbo, - ) - - @property - def is_internal_router(self) -> bool: - # 310P Ascend path expects router logits from the model forward path. - return False - - def forward( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - result = AscendFusedMoE310.forward( - self, - hidden_states=hidden_states, - router_logits=router_logits, - ) - # When shared experts are absent, the parent returns only fused_out; - # otherwise it returns a (shared_out, fused_out) tuple. - if self._shared_experts is None: - return None, result - return result - def _forward_shared_experts(self, hidden_states: torch.Tensor): if self._shared_experts is None: return None return self._shared_experts(hidden_states) - def forward_impl( # type: ignore[override] + def shared_forward_impl( # type: ignore[override] self, hidden_states: torch.Tensor, router_logits: torch.Tensor ): routed_out = AscendFusedMoE310.forward_impl( diff --git a/vllm_ascend/__init__.py b/vllm_ascend/__init__.py index c5b7d1b0445..e90b796b4e1 100644 --- a/vllm_ascend/__init__.py +++ b/vllm_ascend/__init__.py @@ -15,6 +15,25 @@ # This file is a part of the vllm-ascend project. # +_GLOBAL_PATCH_APPLIED = False + + +def _ensure_global_patch(): + """Apply process-wide vLLM patches before engine-core initialization. + + vLLM loads general plugins in engine-core subprocesses. E2E test + conftest hooks do not run there, so global patches that affect scheduler + and engine code must also be applied through these plugin entry points. + """ + global _GLOBAL_PATCH_APPLIED + if _GLOBAL_PATCH_APPLIED: + return + + from vllm_ascend.utils import adapt_patch + + adapt_patch(is_global_patch=True) + _GLOBAL_PATCH_APPLIED = True + def register(): """Register the NPU platform.""" @@ -23,12 +42,16 @@ def register(): def register_connector(): + _ensure_global_patch() + from vllm_ascend.distributed.kv_transfer import register_connector register_connector() def register_model_loader(): + _ensure_global_patch() + from .model_loader.netloader import register_netloader from .model_loader.rfork import register_rforkloader @@ -37,6 +60,8 @@ def register_model_loader(): def register_service_profiling(): + _ensure_global_patch() + from .profiling_config import generate_service_profiling_config generate_service_profiling_config() diff --git a/vllm_ascend/core/scheduler_profiling_chunk.py b/vllm_ascend/core/scheduler_profiling_chunk.py index d66edd42767..ef8285a77e0 100644 --- a/vllm_ascend/core/scheduler_profiling_chunk.py +++ b/vllm_ascend/core/scheduler_profiling_chunk.py @@ -59,6 +59,9 @@ def __init__( kv_cache_config: KVCacheConfig, structured_output_manager: StructuredOutputManager, block_size: int, + # `hash_block_size` was added in vLLM #40946; keep it optional so the + # subclass works on both pinned vllm and main. + hash_block_size: int | None = None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, include_finished_set: bool = False, log_stats: bool = False, @@ -68,6 +71,7 @@ def __init__( kv_cache_config, structured_output_manager, block_size, + hash_block_size=hash_block_size, mm_registry=mm_registry, include_finished_set=include_finished_set, log_stats=log_stats, diff --git a/vllm_ascend/lora/utils.py b/vllm_ascend/lora/utils.py index d822b362ee4..5f3ab650f54 100755 --- a/vllm_ascend/lora/utils.py +++ b/vllm_ascend/lora/utils.py @@ -26,6 +26,7 @@ AscendRowParallelLinear, ) from vllm_ascend.ops.vocab_parallel_embedding import AscendVocabParallelEmbedding +from vllm_ascend.utils import vllm_version_is class AscendColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): @@ -185,15 +186,27 @@ def can_replace_layer( def refresh_all_lora_classes(): - vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedColumnParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedColumnParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendReplicatedLinearWithLoRA) + ascend_classes = ( + AscendColumnParallelLinearWithLoRA, + AscendMergedColumnParallelLinearWithLoRA, + AscendRowParallelLinearWithLoRA, + AscendVocabParallelEmbeddingWithLoRA, + AscendQKVParallelLinearWithLoRA, + AscendMergedQKVParallelLinearWithLoRA, + AscendColumnParallelLinearWithShardedLoRA, + AscendMergedColumnParallelLinearWithShardedLoRA, + AscendMergedQKVParallelLinearWithShardedLoRA, + AscendQKVParallelLinearWithShardedLoRA, + AscendRowParallelLinearWithShardedLoRA, + AscendReplicatedLinearWithLoRA, + ) + if vllm_version_is("0.19.1"): + for cls in ascend_classes: + vllm.lora.utils._all_lora_classes.add(cls) + else: + # vLLM #35077 changed _all_lora_classes from set to ordered tuple. + # Append the Ascend classes in a deterministic order. + vllm.lora.utils._all_lora_classes = ( + *vllm.lora.utils._all_lora_classes, + *ascend_classes, + ) diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index f9f993fd077..b0c5c508132 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -21,7 +21,6 @@ import torch import torch.nn.functional as F import torch_npu -from vllm._aiter_ops import rocm_aiter_ops from vllm.config import get_current_vllm_config from vllm.distributed import get_dp_group, get_ep_group, get_tp_group, tensor_model_parallel_all_reduce from vllm.forward_context import get_forward_context @@ -29,8 +28,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer -from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner # type: ignore import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config @@ -84,6 +82,11 @@ def __init__(self, moe: FusedMoEConfig = None): def is_monolithic(self) -> bool: return False + def maybe_make_prepare_finalize(self, routing_tables=None): + # Ascend uses its own MoE communication and forward_impl path. + # Do not let upstream modular-kernel initialization replace it. + return None + def process_weights_after_loading(self, layer): super(UnquantizedFusedMoEMethod, self).process_weights_after_loading(layer) @@ -229,8 +232,7 @@ def apply( return final_hidden_states -# Please remove this inheritance after extending vllm, todo(wxs) -class AscendMoERunner(DefaultMoERunner): +class AscendMoERunner(MoERunner): @property def use_dp_chunking(self) -> bool: """Ascend uses its own forward_impl path, not the FlashInfer Cutlass @@ -250,18 +252,22 @@ def forward_impl( This delegates to the layer's forward_impl method which contains the Ascend-specific MoE computation logic. """ - result = layer.forward_impl(hidden_states, router_logits) - # If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out) - # Otherwise, it returns just routed_out - # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared + if self.shared_experts is None: + result = layer.forward_impl(hidden_states, router_logits) + # If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out) + # Otherwise, it returns just routed_out + # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared + else: + result = layer.shared_forward_impl(hidden_states, router_logits) return result - def forward_dispatch( + def _forward_impl( self, layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, + input_ids: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: with self._sequence_parallel_context(): return self.forward_impl( @@ -278,7 +284,11 @@ class AscendFusedMoE(FusedMoE): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - + self.use_overlapped = True + self._routed_input_transform = kwargs.get("routed_input_transform") + self._shared_experts = kwargs.get("shared_experts") + self.shared_expert_stream = None + has_shared_experts = self._shared_experts is not None num_experts = kwargs["num_experts"] intermediate_size = kwargs["intermediate_size"] num_shared_experts = kwargs.get("n_shared_experts", 0) @@ -295,6 +305,12 @@ def __init__(self, *args, **kwargs): self.quant_method = self.quant_config.get_quant_method(self, self.layer_name) assert self.quant_method is not None + # Keep base_quant_method in sync with the swapped-in Ascend method, + # otherwise FusedMoE.maybe_init_modular_kernel (called via the V2 + # model runner's prepare_communication_buffer_for_model) would dispatch + # to the upstream UnquantizedFusedMoEMethod.maybe_make_prepare_finalize, + # which raises by design. + self.base_quant_method = self.quant_method self.moe_config.tp_group = get_tp_group() self.moe_config.dp_group = get_dp_group() @@ -302,6 +318,11 @@ def __init__(self, *args, **kwargs): self.moe_config.mc2_group = get_mc2_group() self.moe_config.supports_eplb = self.quant_method.supports_eplb ascend_config = get_ascend_config() + self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert and has_shared_experts + self.shared_multistream_overlap_gate = ascend_config.multistream_overlap_gate and has_shared_experts + if enable_sp() and has_shared_experts: + logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") + # flashcommon3 gate stream self.multistream_overlap_gate = ascend_config.multistream_overlap_gate if self.multistream_overlap_gate and AscendFusedMoE.gate_stream is None: @@ -311,6 +332,7 @@ def __init__(self, *args, **kwargs): self.e_score_correction_bias.data = self.e_score_correction_bias.data.to( dtype=vllm_config.model_config.dtype ) + self._gate = kwargs.get("gate") # init moe eplb_config = ascend_config.eplb_config @@ -374,10 +396,62 @@ def __init__(self, *args, **kwargs): kwargs.pop("gate", None), kwargs.pop("shared_experts", None), self.quant_method, - self.reduce_results, self.vllm_config.parallel_config.enable_dbo, ) + if self.multistream_overlap_shared_expert: + # Wrap the quant_method's process_weights_after_loading to validate that + # splitting shared expert computation (gate_up projection + activation, + # then down projection) yields identical results to integrated + # computation after weight loading. + original_process_weights = self.quant_method.process_weights_after_loading + + @wraps(original_process_weights) + def wrapped_process_weights(*args, **kwargs): + result = original_process_weights(*args, **kwargs) + self._validate_shared_expert_consistency() + return result + + self.quant_method.process_weights_after_loading = wrapped_process_weights # type: ignore + + def _validate_shared_expert_consistency(self): + """Validate that split shared expert computation matches integrated + computation.""" + test_input = ( + torch.rand(10, self.hidden_size, device="npu", dtype=self.moe_config.in_dtype) * 2 - 1 + ) # Random input for testing, scoped to [-1, 1] + + assert self._shared_experts is not None + integrated_out = self._shared_experts(test_input) + part1_out = self._shared_experts_part1(test_input) + split_out = self._shared_experts_part2(test_input, part1_out) + + if not torch.allclose(integrated_out, split_out): + diff = (integrated_out - split_out).abs() + logger.error("FusedMoE shared experts split computation does not match the integrated computation.") + logger.error("Max absolute difference: %s", diff.max().item()) + logger.error( + "Integrated output - sum: %s, norm: %s", integrated_out.sum().item(), integrated_out.norm().item() + ) + logger.error("Split output - sum: %s, norm: %s", split_out.sum().item(), split_out.norm().item()) + raise ValueError("FusedMoE shared experts split computation does not match the integrated computation.") + logger.info_once("FusedMoE shared experts split computation matches the integrated computation.") + + def _shared_experts_part1(self, hidden_states: torch.Tensor): + shared_gate_up, _ = self._shared_experts.gate_up_proj(hidden_states) # type: ignore + return shared_gate_up + + def _shared_experts_part2(self, hidden_states: torch.Tensor, shared_gate_up: torch.Tensor): + shared_act = self._shared_experts.act_fn(shared_gate_up) # type: ignore + shared_out, _ = self._shared_experts.down_proj(shared_act) # type: ignore + + # Qwen3-Next specific gating mechanism + assert self._shared_experts is not None + if hasattr(self._shared_experts, "expert_gate") and self._shared_experts.expert_gate is not None: + gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore + shared_out = F.sigmoid(gate_out) * shared_out + return shared_out + def _get_quant_type(self) -> QuantType: quant_type = QuantType.NONE method = getattr(self.quant_method, "quant_method", None) @@ -408,6 +482,21 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens """ return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states) + @property + def gate(self) -> torch.nn.Module | None: + return self._gate if self.use_overlapped else None + + @property + def is_internal_router(self) -> bool: + return False + + @property + def use_dp_chunking(self) -> bool: + """This func routes to the chunked forward path using the FlashInfer Cutlass kernel + only when data parallelism (DP) is enabled. Thus just returning False in vllm-ascend + """ + return False + def forward( self, hidden_states: torch.Tensor, @@ -537,9 +626,10 @@ def forward_impl( # type: ignore[override] self.load_counter.add_(1) else: self.moe_load.add_(local_load) + routed_out = _EXTRA_CTX.moe_comm_method.finalize( hidden_states=fused_experts_results.routed_out, - reduce_results=self.reduce_results, + reduce_results=isinstance(_EXTRA_CTX.moe_comm_method, AllGatherCommImpl), padded_hidden_states_shape=padded_hidden_states_shape, ) @@ -553,140 +643,6 @@ def forward_impl( # type: ignore[override] # The vLLM FusedMoE forward_impl does not return events. return routed_out - -class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE): - def __init__( - self, - shared_experts: torch.nn.Module, - gate: torch.nn.Module | None = None, - use_overlapped: bool = True, - routed_input_transform: torch.nn.Module | None = None, - **kwargs, - ): - ascend_config = get_ascend_config() - # TODO: Enabling the mix placement in deepseek_v2.py - # remove this part after the mix placement merged into vllm - # https://github.com/vllm-project/vllm/pull/31256 - if ascend_config.mix_placement: - rocm_aiter_ops.is_fusion_moe_shared_experts_enabled = mock_false - rocm_aiter_ops.is_fused_moe_enabled = mock_false - AscendFusedMoE.__init__(self, **kwargs) - if ascend_config.mix_placement: - rocm_aiter_ops.is_fusion_moe_shared_experts_enabled = mock_true - rocm_aiter_ops.is_fused_moe_enabled = mock_true - - self._routed_input_transform = routed_input_transform - self._shared_experts = shared_experts - self.use_overlapped = use_overlapped - self.shared_expert_stream = None - has_shared_experts = shared_experts is not None - self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert and has_shared_experts - self.multistream_overlap_gate = ascend_config.multistream_overlap_gate and has_shared_experts - if enable_sp(): - logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") - - self._gate = gate - # Recreate the runner with the correct shared_experts parameter. - # The parent class created the runner before self._shared_experts was set. - # NOTE: must use self._shared_experts here, not self.shared_experts — - # FusedMoE.shared_experts is a property that reads self.runner.shared_experts, - # which at this point is still the stale runner built with shared_experts=None. - self.runner = AscendMoERunner( - self.layer_name, - self.moe_config, - self.router, - self._routed_input_transform, - self.gate, - self._shared_experts, - self.quant_method, - self.reduce_results, - self.vllm_config.parallel_config.enable_dbo, - ) - - if self.multistream_overlap_shared_expert: - # Wrap the quant_method's process_weights_after_loading to validate that - # splitting shared expert computation (gate_up projection + activation, - # then down projection) yields identical results to integrated - # computation after weight loading. - original_process_weights = self.quant_method.process_weights_after_loading - - @wraps(original_process_weights) - def wrapped_process_weights(*args, **kwargs): - result = original_process_weights(*args, **kwargs) - self._validate_shared_expert_consistency() - return result - - self.quant_method.process_weights_after_loading = wrapped_process_weights # type: ignore - - def _shared_experts_part1(self, hidden_states: torch.Tensor): - shared_gate_up, _ = self._shared_experts.gate_up_proj(hidden_states) # type: ignore - return shared_gate_up - - def _shared_experts_part2(self, hidden_states: torch.Tensor, shared_gate_up: torch.Tensor): - shared_act = self._shared_experts.act_fn(shared_gate_up) # type: ignore - shared_out, _ = self._shared_experts.down_proj(shared_act) # type: ignore - - # Qwen3-Next specific gating mechanism - if hasattr(self._shared_experts, "expert_gate") and self._shared_experts.expert_gate is not None: - gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore - shared_out = F.sigmoid(gate_out) * shared_out - return shared_out - - def _validate_shared_expert_consistency(self): - """Validate that split shared expert computation matches integrated - computation.""" - test_input = ( - torch.rand(10, self.hidden_size, device="npu", dtype=self.moe_config.in_dtype) * 2 - 1 - ) # Random input for testing, scoped to [-1, 1] - - integrated_out = self._shared_experts(test_input) - part1_out = self._shared_experts_part1(test_input) - split_out = self._shared_experts_part2(test_input, part1_out) - - if not torch.allclose(integrated_out, split_out): - diff = (integrated_out - split_out).abs() - logger.error("SharedFusedMoE shared experts split computation does not match the integrated computation.") - logger.error("Max absolute difference: %s", diff.max().item()) - logger.error( - "Integrated output - sum: %s, norm: %s", integrated_out.sum().item(), integrated_out.norm().item() - ) - logger.error("Split output - sum: %s, norm: %s", split_out.sum().item(), split_out.norm().item()) - raise ValueError( - "SharedFusedMoE shared experts split computation does not match the integrated computation." - ) - logger.info_once("SharedFusedMoE shared experts split computation matches the integrated computation.") - - @property - def gate(self) -> torch.nn.Module | None: - return self._gate if self.use_overlapped else None - - @property - def is_internal_router(self) -> bool: - return False - - @property - def use_dp_chunking(self) -> bool: - """This func routes to the chunked forward path using the FlashInfer Cutlass kernel - only when data parallelism (DP) is enabled. Thus just returning False in vllm-ascend - """ - return False - - def forward( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - result = AscendFusedMoE.forward( - self, - hidden_states=hidden_states, - router_logits=router_logits, - ) - # When shared experts are absent, the parent returns only fused_out; - # otherwise it returns a (shared_out, fused_out) tuple. - if self._shared_experts is None: - return None, result - return result - def _forward_shared_experts(self, hidden_states: torch.Tensor, fused_moe_evts: FusedMoEEvents): if self._shared_experts is None: return None @@ -722,15 +678,15 @@ def maybe_wait_event(evt: torch.npu.Event | None): shared_out = tensor_model_parallel_all_reduce(shared_out) return shared_out - def forward_impl( # type: ignore[override] + def shared_forward_impl( # type: ignore[override] self, hidden_states: torch.Tensor, router_logits: torch.Tensor ): - if self.multistream_overlap_gate: + if self.shared_multistream_overlap_gate: set_flash_common3_context(shared_experts=self._shared_experts) before_routed_experts = torch.npu.current_stream().record_event() - fused_moe_results = AscendFusedMoE.forward_impl( - self, + + fused_moe_results = self.forward_impl( hidden_states=hidden_states, router_logits=router_logits, return_with_event=True, @@ -740,7 +696,7 @@ def forward_impl( # type: ignore[override] if self._shared_experts is None: return routed_out - if self.multistream_overlap_gate: + if self.shared_multistream_overlap_gate: fc3_context = get_flash_common3_context() assert fc3_context is not None shared_out = fc3_context.shared_out @@ -753,5 +709,4 @@ def forward_impl( # type: ignore[override] before_combine=fused_moe_results.before_combine_evt, ), ) - return shared_out, routed_out diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py index 013076ebc88..4433d521371 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -166,12 +166,25 @@ def __init__( norm_before_gate: bool = False, device: torch.device | None = None, dtype: torch.dtype | None = None, + # `activation` was added in vLLM #40245 (Qwen3-Next/GDN). Accept and + # forward it; older vllm versions did not pass this kwarg so the + # default keeps existing behavior. + activation: str = "swish", ): """If group_size is not None, we do GroupNorm with each group having group_size elements. group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). """ factory_kwargs = {"device": device, "dtype": dtype} - super().__init__(hidden_size, eps, group_size, norm_before_gate, device, dtype) + super().__init__( + hidden_size, + eps, + group_size, + norm_before_gate, + device, + dtype, + activation=activation, + ) + self.eps = eps self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) self.register_parameter("bias", None) diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index 047b27eea9e..67886b77b23 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -79,6 +79,7 @@ def __init__( cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", + skip_topk: bool = False, ) -> None: nn.Module.__init__(self) self.hidden_size = hidden_size @@ -93,6 +94,7 @@ def __init__( self.enable_shared_expert_dp = get_ascend_config().enable_shared_expert_dp self.tp_size = get_tensor_model_parallel_world_size() self.layers = hf_config.num_hidden_layers + self.skip_topk = skip_topk if mla_modules.indexer is not None: ascend_indexer = IndexerWrapper(mla_modules.indexer) else: diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index 9a793352e2f..2c92fb86631 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -18,6 +18,8 @@ import vllm_ascend.patch.platform.patch_distributed # noqa import vllm_ascend.patch.platform.patch_kv_cache_interface # noqa +import vllm_ascend.patch.platform.patch_kv_cache_utils # noqa +import vllm_ascend.patch.platform.patch_mla_prefill_backend # noqa from vllm_ascend import envs from vllm_ascend.utils import is_310p diff --git a/vllm_ascend/patch/platform/patch_kv_cache_utils.py b/vllm_ascend/patch/platform/patch_kv_cache_utils.py new file mode 100644 index 00000000000..20c09004412 --- /dev/null +++ b/vllm_ascend/patch/platform/patch_kv_cache_utils.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM-Ascend project +import math + +import vllm.v1.core.kv_cache_utils +from vllm.config import VllmConfig +from vllm.v1.kv_cache_interface import KVCacheConfig + +_orig_resolve_kv_cache_block_sizes = vllm.v1.core.kv_cache_utils.resolve_kv_cache_block_sizes + + +def _ascend_resolve_kv_cache_block_sizes( + kv_cache_config: KVCacheConfig, + vllm_config: VllmConfig, +) -> tuple[int, int]: + """Ascend-compatible resolve_kv_cache_block_sizes. + + vLLM PR #40860 added a restriction that hybrid KV cache groups with + multiple block sizes do not support context parallelism (dcp/pcp > 1). + This restriction is correct for CUDA but not for Ascend, which implements + context parallelism for MLA and SWA-MLA layers independently. + + For multiple KV cache groups with CP, compute scheduler_block_size as + lcm(group_block_sizes) * dcp * pcp to maintain alignment, consistent + with the pre-PR-#40860 behavior of block_size * dcp * pcp. + """ + cache_config = vllm_config.cache_config + dcp = vllm_config.parallel_config.decode_context_parallel_size + pcp = vllm_config.parallel_config.prefill_context_parallel_size + groups = kv_cache_config.kv_cache_groups + + if len(groups) <= 1: + bs = cache_config.block_size * dcp * pcp + return bs, bs + + if dcp != 1 or pcp != 1: + # Ascend supports CP with multiple KV cache groups; compute + # scheduler_block_size using the LCM of all group block sizes + # multiplied by the CP factors for proper alignment. + group_block_sizes = [g.kv_cache_spec.block_size for g in groups] + scheduler_block_size = math.lcm(*group_block_sizes) * dcp * pcp + return scheduler_block_size, scheduler_block_size + + return _orig_resolve_kv_cache_block_sizes(kv_cache_config, vllm_config) + + +vllm.v1.core.kv_cache_utils.resolve_kv_cache_block_sizes = _ascend_resolve_kv_cache_block_sizes + +# Also patch the reference used by engine/core.py which imports the function directly. +import vllm.v1.engine.core # noqa: E402 + +vllm.v1.engine.core.resolve_kv_cache_block_sizes = _ascend_resolve_kv_cache_block_sizes diff --git a/vllm_ascend/patch/platform/patch_mla_prefill_backend.py b/vllm_ascend/patch/platform/patch_mla_prefill_backend.py new file mode 100644 index 00000000000..c904575cb3f --- /dev/null +++ b/vllm_ascend/patch/platform/patch_mla_prefill_backend.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# PR vllm-project/vllm#32623 introduced a new MLAPrefillBackend abstraction. +# When MLAAttention.__init__ calls get_mla_prefill_backend(), the upstream +# selector sees that Ascend NPU returns None for get_device_capability() and +# falls back to FlashAttnPrefillBackend, which asserts flash_attn_varlen_func +# is available — crashing on Ascend. +# +# Ascend's AscendSFAImpl/AscendMLAImpl handles the full forward pass (including +# prefill) via impl.forward(), so prefill_backend.run_prefill_* is never called. +# We register a no-op AscendMLAPrefillBackend and patch get_mla_prefill_backend +# so that MLAAttention.__init__ completes without error. + +import torch +import vllm.model_executor.layers.attention.mla_attention + +from vllm_ascend.utils import vllm_version_is + +if not vllm_version_is("0.20.1"): + from vllm.v1.attention.backends.mla.prefill.base import MLAPrefillBackend + + class AscendMLAPrefillBackend(MLAPrefillBackend): + @staticmethod + def get_name() -> str: + return "ASCEND" + + @classmethod + def is_available(cls) -> bool: + return True + + def run_prefill_new_tokens( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + return_softmax_lse: bool, + ) -> torch.Tensor: + raise NotImplementedError("Ascend MLA prefill is handled by AscendSFAImpl/AscendMLAImpl") + + def run_prefill_context_chunk( + self, + chunk_idx: int, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError("Ascend MLA prefill is handled by AscendSFAImpl/AscendMLAImpl") + + vllm.model_executor.layers.attention.mla_attention.get_mla_prefill_backend = ( + lambda vllm_config: AscendMLAPrefillBackend + ) diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py index 358c294ebe0..26c0bf395ac 100644 --- a/vllm_ascend/sample/rejection_sampler.py +++ b/vllm_ascend/sample/rejection_sampler.py @@ -94,6 +94,8 @@ def rejection_sample( # [batch_size, 1] bonus_token_ids: torch.Tensor, sampling_metadata: SamplingMetadata, + synthetic_mode: bool = False, + synthetic_conditional_rates: torch.Tensor | None = None, ) -> torch.Tensor: assert draft_token_ids.ndim == 1 assert draft_probs is None or draft_probs.ndim == 2 diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index b39a9f27cce..bdf4a9e1945 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -624,7 +624,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul from vllm_ascend.ops.conv import AscendConv3dLayer - from vllm_ascend.ops.fused_moe.fused_moe import AscendFusedMoE, AscendSharedFusedMoE + from vllm_ascend.ops.fused_moe.fused_moe import AscendFusedMoE from vllm_ascend.ops.gdn import AscendGatedDeltaNetAttention from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm, AscendRMSNormGated from vllm_ascend.ops.linear import ( @@ -670,7 +670,6 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): "RMSNorm": AscendRMSNorm, "GemmaRMSNorm": AscendGemmaRMSNorm, "FusedMoE": AscendFusedMoE, - "SharedFusedMoE": AscendSharedFusedMoE, "MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention, "MMEncoderAttention": AscendMMEncoderAttention, "ApplyRotaryEmb": AscendApplyRotaryEmb, @@ -683,7 +682,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): # 310P: override selected ops with 310P implementations (keep minimal changes outside _310p) if is_310p(): - from vllm_ascend._310p.fused_moe.fused_moe import AscendFusedMoE310, AscendSharedFusedMoE310 + from vllm_ascend._310p.fused_moe.fused_moe import AscendFusedMoE310 from vllm_ascend._310p.ops.activation import AscendSiluAndMul310 from vllm_ascend._310p.ops.conv import AscendConv3dLayer310 from vllm_ascend._310p.ops.fla.gdn_310 import AscendGatedDeltaNetAttention310 @@ -707,7 +706,6 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): "GemmaRMSNorm": AscendGemmaRMSNorm310, "RMSNormGated": AscendRMSNormGated310, "FusedMoE": AscendFusedMoE310, - "SharedFusedMoE": AscendSharedFusedMoE310, "ParallelLMHead": AscendParallelLMHead310, "VocabParallelEmbedding": AscendVocabParallelEmbedding310, "MMEncoderAttention": AscendMMEncoderAttention310, diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py index 8600465ad0d..fa5f4443298 100644 --- a/vllm_ascend/worker/npu_input_batch.py +++ b/vllm_ascend/worker/npu_input_batch.py @@ -50,6 +50,11 @@ def __init__( ): self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode + # Added for compatibility with InputBatch methods that reference these + # attributes after PR vllm-project/vllm#34668. NPU does not use + # thinking budget, so the holder is always None. + self.thinking_budget_state_holder = None + self.thinking_token_budget_reqs: set[str] = set() self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len self.max_num_batched_tokens = max_num_batched_tokens diff --git a/vllm_ascend/worker/v2/model_runner.py b/vllm_ascend/worker/v2/model_runner.py index 1616843a1d0..a2903dca9c4 100644 --- a/vllm_ascend/worker/v2/model_runner.py +++ b/vllm_ascend/worker/v2/model_runner.py @@ -304,6 +304,16 @@ def prepare_inputs( input_ids = self.input_buffers.input_ids[:num_tokens_after_padding] positions = self.input_buffers.positions[:num_tokens_after_padding] + # CPU upper bound on seq_lens (num_computed_tokens + num_scheduled_tokens). + # Added by vLLM PR #40654 to avoid GPU->CPU sync for seq_lens. + seq_lens_cpu_upper_bound_np = np.zeros(num_reqs_padded, dtype=np.int32) + np.add( + self.req_states.num_computed_tokens_np[idx_mapping_np], + num_scheduled_tokens, + out=seq_lens_cpu_upper_bound_np[:num_reqs], + ) + seq_lens_cpu_upper_bound = torch.from_numpy(seq_lens_cpu_upper_bound_np) + self.input_batch = AscendInputBatch( req_ids=req_ids, num_reqs=num_reqs, @@ -319,6 +329,7 @@ def prepare_inputs( query_start_loc=query_start_loc, query_start_loc_np=query_start_loc_np, seq_lens=seq_lens, + seq_lens_cpu_upper_bound=seq_lens_cpu_upper_bound, dcp_local_seq_lens=None, # TODO(Ronald1995): support cp. input_ids=input_ids, positions=positions, diff --git a/vllm_ascend/worker/v2/sample/logprob.py b/vllm_ascend/worker/v2/sample/logprob.py index f157d58cc3f..b983da1a784 100644 --- a/vllm_ascend/worker/v2/sample/logprob.py +++ b/vllm_ascend/worker/v2/sample/logprob.py @@ -22,6 +22,10 @@ from vllm.v1.outputs import LogprobsTensors from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num +from vllm_ascend.utils import vllm_version_is + +if not vllm_version_is("0.20.1"): + from vllm.v1.worker.gpu.sample.logprob import LogprobTokenIdsState @triton.jit @@ -115,50 +119,104 @@ def _ranks_kernel( tl.store(output_ptr + req_idx, n) -def compute_topk_logprobs( - logits: torch.Tensor, - num_logprobs: int, - sampled_token_ids: torch.Tensor, - cu_num_logits: list[int] | None = None, -) -> LogprobsTensors: - assert num_logprobs >= 0 - batch_size, vocab_size = logits.shape - logprob_token_ids = sampled_token_ids.unsqueeze(-1) - if num_logprobs > 0: - topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices - logprob_token_ids = torch.cat((sampled_token_ids.unsqueeze(-1), topk_indices), dim=1) - - # NOTE(woosuk): Here, to save GPU memory, we do not materialize the full - # logprobs tensor. Instead, we only compute and return the logprobs of - # the topk + 1 tokens. - logprobs = compute_token_logprobs(logits, logprob_token_ids) - token_ranks = torch.empty( - batch_size, - dtype=torch.int64, - device=logits.device, - ) - - vec_core = get_vectorcore_num() - NUM_CORES = min(batch_size, vec_core) - - rows_per_core = triton.cdiv(batch_size, NUM_CORES) - BLOCK_SIZE = 8192 - grid = (NUM_CORES,) - _ranks_kernel[grid]( - token_ranks, - logits, - logits.stride(0), - sampled_token_ids, - vocab_size, - batch_size, - rows_per_core, - BLOCK_SIZE=BLOCK_SIZE, - multibuffer=False, - ) - - return LogprobsTensors( - logprob_token_ids=logprob_token_ids, - logprobs=logprobs, - selected_token_ranks=token_ranks, - cu_num_generated_tokens=cu_num_logits, - ) +if not vllm_version_is("0.20.1"): + + def compute_topk_logprobs( + logits: torch.Tensor, + num_logprobs: int, + sampled_token_ids: torch.Tensor, + cu_num_logits: list[int] | None = None, + logprob_token_ids_state: "LogprobTokenIdsState | None" = None, + expanded_idx_mapping: torch.Tensor | None = None, + max_per_req_token_ids: int = 0, + ) -> LogprobsTensors: + assert num_logprobs >= 0 + batch_size, vocab_size = logits.shape + logprob_token_ids = sampled_token_ids.unsqueeze(-1) + if num_logprobs > 0: + topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices + logprob_token_ids = torch.cat((sampled_token_ids.unsqueeze(-1), topk_indices), dim=1) + + # NOTE(woosuk): Here, to save GPU memory, we do not materialize the full + # logprobs tensor. Instead, we only compute and return the logprobs of + # the topk + 1 tokens. + logprobs = compute_token_logprobs(logits, logprob_token_ids) + token_ranks = torch.empty( + batch_size, + dtype=torch.int64, + device=logits.device, + ) + + vec_core = get_vectorcore_num() + NUM_CORES = min(batch_size, vec_core) + + rows_per_core = triton.cdiv(batch_size, NUM_CORES) + BLOCK_SIZE = 8192 + grid = (NUM_CORES,) + _ranks_kernel[grid]( + token_ranks, + logits, + logits.stride(0), + sampled_token_ids, + vocab_size, + batch_size, + rows_per_core, + BLOCK_SIZE=BLOCK_SIZE, + multibuffer=False, + ) + + return LogprobsTensors( + logprob_token_ids=logprob_token_ids, + logprobs=logprobs, + selected_token_ranks=token_ranks, + cu_num_generated_tokens=cu_num_logits, + ) +else: + + def compute_topk_logprobs( + logits: torch.Tensor, + num_logprobs: int, + sampled_token_ids: torch.Tensor, + cu_num_logits: list[int] | None = None, + ) -> LogprobsTensors: + assert num_logprobs >= 0 + batch_size, vocab_size = logits.shape + logprob_token_ids = sampled_token_ids.unsqueeze(-1) + if num_logprobs > 0: + topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices + logprob_token_ids = torch.cat((sampled_token_ids.unsqueeze(-1), topk_indices), dim=1) + + # NOTE(woosuk): Here, to save GPU memory, we do not materialize the full + # logprobs tensor. Instead, we only compute and return the logprobs of + # the topk + 1 tokens. + logprobs = compute_token_logprobs(logits, logprob_token_ids) + token_ranks = torch.empty( + batch_size, + dtype=torch.int64, + device=logits.device, + ) + + vec_core = get_vectorcore_num() + NUM_CORES = min(batch_size, vec_core) + + rows_per_core = triton.cdiv(batch_size, NUM_CORES) + BLOCK_SIZE = 8192 + grid = (NUM_CORES,) + _ranks_kernel[grid]( + token_ranks, + logits, + logits.stride(0), + sampled_token_ids, + vocab_size, + batch_size, + rows_per_core, + BLOCK_SIZE=BLOCK_SIZE, + multibuffer=False, + ) + + return LogprobsTensors( + logprob_token_ids=logprob_token_ids, + logprobs=logprobs, + selected_token_ranks=token_ranks, + cu_num_generated_tokens=cu_num_logits, + ) diff --git a/vllm_ascend/worker/v2/spec_decode/eagle/aclgraph.py b/vllm_ascend/worker/v2/spec_decode/eagle/aclgraph.py index ce7bb93e07f..d6c702f22a2 100644 --- a/vllm_ascend/worker/v2/spec_decode/eagle/aclgraph.py +++ b/vllm_ascend/worker/v2/spec_decode/eagle/aclgraph.py @@ -14,9 +14,15 @@ from vllm.v1.worker.gpu.cudagraph_utils import BatchExecutionDescriptor from vllm.v1.worker.gpu.input_batch import InputBuffers from vllm.v1.worker.gpu.model_states.interface import ModelState -from vllm.v1.worker.gpu.spec_decode.eagle.cudagraph import EagleCudaGraphManager from vllm.v1.worker.utils import AttentionGroup +from vllm_ascend.utils import vllm_version_is + +if not vllm_version_is("0.20.1"): + from vllm.v1.worker.gpu.spec_decode.eagle.cudagraph import DecodeEagleCudaGraphManager, PrefillEagleCudaGraphManager +else: + from vllm.v1.worker.gpu.spec_decode.eagle.cudagraph import EagleCudaGraphManager + from vllm_ascend.ascend_forward_context import _EXTRA_CTX from vllm_ascend.compilation.acl_graph import ( set_draft_graph_params, @@ -26,102 +32,309 @@ from vllm_ascend.worker.v2.aclgraph_utils import ModelWithContext from vllm_ascend.worker.v2.utils import communicator_switch +if not vllm_version_is("0.20.1"): + + class PrefillEagleAclGraphManager(PrefillEagleCudaGraphManager): + """AclGraphManager for Eagle speculative decoding.""" + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + cudagraph_mode: CUDAGraphMode, + decode_query_len: int, + speculator: Any, + ): + super().__init__(vllm_config, device, cudagraph_mode, decode_query_len) + + # set speculator attribute, so we can access attributes speculator + # when call `run_fullgraph` method in CudaGraphManager, + # then we don't need to # copy `propose` method in `AscendEagleSpeculator` class. + self.speculator = speculator + # capture_sizes sorts in ascending order. + self.capture_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) + # vllm-ascend need to update draft graph params of attention backend. + # so we need to set draft graph params before capture full graph. + # `prefill` graph and `decodes` graph are different, `decode_query_len` can be used to distinguish them + self.is_draft_model_prefill = decode_query_len > 1 + if super().needs_capture(): + if self.is_draft_model_prefill: + set_draft_graph_prefill_params(self.capture_sizes) + else: + set_draft_graph_params(self.capture_sizes) + + def capture( + self, + forward_fn: Callable, + model_state: ModelState, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_groups: list[list[AttentionGroup]], + kv_cache_config: KVCacheConfig, + progress_bar_desc: str = "Capturing CUDA graphs", + ) -> None: + """Capture ACL graphs for Eagle.""" + with communicator_switch(), model_capture_wrapper(self.speculator, self.is_draft_model_prefill): + super().capture( + forward_fn, + model_state, + input_buffers, + block_tables, + attn_groups, + kv_cache_config, + progress_bar_desc, + ) -class EagleAclGraphManager(EagleCudaGraphManager): - """AclGraphManager for Eagle speculative decoding.""" - - def __init__( - self, - vllm_config: VllmConfig, - device: torch.device, - cudagraph_mode: CUDAGraphMode, - decode_query_len: int, - speculator: Any, - ): - super().__init__(vllm_config, device, cudagraph_mode, decode_query_len) - - # set speculator attribute, so we can access attributes speculator - # when call `run_fullgraph` method in CudaGraphManager, - # then we don't need to # copy `propose` method in `AscendEagleSpeculator` class. - self.speculator = speculator - # capture_sizes sorts in ascending order. - self.capture_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) - # vllm-ascend need to update draft graph params of attention backend. - # so we need to set draft graph params before capture full graph. - # `prefill` graph and `decodes` graph are different, `decode_query_len` can be used to distinguish them - self.is_draft_model_prefill = decode_query_len > 1 - if super().needs_capture(): + def run_fullgraph( + self, desc: BatchExecutionDescriptor + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """Override run_fullgraph to update full graph params in run_fullgraph.""" + num_tokens = desc.num_tokens if self.is_draft_model_prefill: - set_draft_graph_prefill_params(self.capture_sizes) + logger.info_once(f"draft prefill run_fullgraph with num_tokens={num_tokens}") else: - set_draft_graph_params(self.capture_sizes) - - def capture( - self, - forward_fn: Callable, - model_state: ModelState, - input_buffers: InputBuffers, - block_tables: BlockTables, - attn_groups: list[list[AttentionGroup]], - kv_cache_config: KVCacheConfig, - progress_bar_desc: str = "Capturing CUDA graphs", - ) -> None: - """Capture ACL graphs for Eagle.""" - with communicator_switch(), model_capture_wrapper(self.speculator, self.is_draft_model_prefill): - super().capture( - forward_fn, - model_state, - input_buffers, - block_tables, - attn_groups, - kv_cache_config, - progress_bar_desc, + logger.info_once(f"draft run_fullgraph with num_tokens={num_tokens}") + + draft_attn_metadatas = self.speculator.build_draft_attn_metadatas( + desc.num_reqs, self.is_draft_model_prefill ) - def run_fullgraph(self, desc: BatchExecutionDescriptor) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: - """Override run_fullgraph to update full graph params in run_fullgraph.""" - num_tokens = desc.num_tokens - if self.is_draft_model_prefill: - logger.info_once(f"draft prefill run_fullgraph with num_tokens={num_tokens}") - else: - logger.info_once(f"draft run_fullgraph with num_tokens={num_tokens}") - - draft_attn_metadatas = self.speculator.build_draft_attn_metadatas(desc.num_reqs, self.is_draft_model_prefill) - - ret = super().run_fullgraph(desc) - - positions = self.speculator.input_buffers.positions[:num_tokens] - # refer to vllm.v1.worker.gpu.dp_utils.sync_cudagraph_and_dp_padding to - # calculate num_tokens_across_dp. - num_tokens_across_dp = torch.full([self.speculator.dp_size], num_tokens, device=self.device) - with set_forward_context( - self.speculator.model_state.attn_metadata, - self.vllm_config, - num_tokens=num_tokens, - cudagraph_runtime_mode=desc.cg_mode, - num_tokens_across_dp=num_tokens_across_dp, - batch_descriptor=None, # Full graph model don't need batch_descriptor - slot_mapping=None, + ret = super().run_fullgraph(desc) + + positions = self.speculator.input_buffers.positions[:num_tokens] + # refer to vllm.v1.worker.gpu.dp_utils.sync_cudagraph_and_dp_padding to + # calculate num_tokens_across_dp. + num_tokens_across_dp = torch.full([self.speculator.dp_size], num_tokens, device=self.device) + with set_forward_context( + self.speculator.model_state.attn_metadata, + self.vllm_config, + num_tokens=num_tokens, + cudagraph_runtime_mode=desc.cg_mode, + num_tokens_across_dp=num_tokens_across_dp, + batch_descriptor=None, # Full graph model don't need batch_descriptor + slot_mapping=None, + ): + # decide to update draft graph params + _EXTRA_CTX.is_draft_model = True + + # decide to run `prefill` graph or `decodes` graph + _EXTRA_CTX.is_draft_model_prefill = self.is_draft_model_prefill + + forward_context = get_forward_context() + update_full_graph_params( + # FIXME(Ronald1995): support hybrid attn backend + list(self.speculator.attn_backends.values())[0], + self.speculator.update_stream, + forward_context, + num_tokens, + self.vllm_config, + self.speculator.speculative_config, + positions.shape[0], + draft_attn_metadatas=draft_attn_metadatas, + ) + return ret + + class DecodeEagleAclGraphManager(DecodeEagleCudaGraphManager): + """AclGraphManager for Eagle speculative decoding.""" + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + cudagraph_mode: CUDAGraphMode, + decode_query_len: int, + speculator: Any, ): - # decide to update draft graph params - _EXTRA_CTX.is_draft_model = True - - # decide to run `prefill` graph or `decodes` graph - _EXTRA_CTX.is_draft_model_prefill = self.is_draft_model_prefill - - forward_context = get_forward_context() - update_full_graph_params( - # FIXME(Ronald1995): support hybrid attn backend - list(self.speculator.attn_backends.values())[0], - self.speculator.update_stream, - forward_context, - num_tokens, + super().__init__(vllm_config, device, cudagraph_mode, decode_query_len) + + # set speculator attribute, so we can access attributes speculator + # when call `run_fullgraph` method in CudaGraphManager, + # then we don't need to # copy `propose` method in `AscendEagleSpeculator` class. + self.speculator = speculator + # capture_sizes sorts in ascending order. + self.capture_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) + # vllm-ascend need to update draft graph params of attention backend. + # so we need to set draft graph params before capture full graph. + # `prefill` graph and `decodes` graph are different, `decode_query_len` can be used to distinguish them + self.is_draft_model_prefill = decode_query_len > 1 + if super().needs_capture(): + if self.is_draft_model_prefill: + set_draft_graph_prefill_params(self.capture_sizes) + else: + set_draft_graph_params(self.capture_sizes) + + def capture( + self, + forward_fn: Callable, + model_state: ModelState, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_groups: list[list[AttentionGroup]], + kv_cache_config: KVCacheConfig, + progress_bar_desc: str = "Capturing CUDA graphs", + ) -> None: + """Capture ACL graphs for Eagle.""" + with communicator_switch(), model_capture_wrapper(self.speculator, self.is_draft_model_prefill): + super().capture( + forward_fn, + model_state, + input_buffers, + block_tables, + attn_groups, + kv_cache_config, + progress_bar_desc, + ) + + def run_fullgraph( + self, desc: BatchExecutionDescriptor + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """Override run_fullgraph to update full graph params in run_fullgraph.""" + num_tokens = desc.num_tokens + if self.is_draft_model_prefill: + logger.info_once(f"draft prefill run_fullgraph with num_tokens={num_tokens}") + else: + logger.info_once(f"draft run_fullgraph with num_tokens={num_tokens}") + + draft_attn_metadatas = self.speculator.build_draft_attn_metadatas( + desc.num_reqs, self.is_draft_model_prefill + ) + + ret = super().run_fullgraph(desc) + + positions = self.speculator.input_buffers.positions[:num_tokens] + # refer to vllm.v1.worker.gpu.dp_utils.sync_cudagraph_and_dp_padding to + # calculate num_tokens_across_dp. + num_tokens_across_dp = torch.full([self.speculator.dp_size], num_tokens, device=self.device) + with set_forward_context( + self.speculator.model_state.attn_metadata, self.vllm_config, - self.speculator.speculative_config, - positions.shape[0], - draft_attn_metadatas=draft_attn_metadatas, + num_tokens=num_tokens, + cudagraph_runtime_mode=desc.cg_mode, + num_tokens_across_dp=num_tokens_across_dp, + batch_descriptor=None, # Full graph model don't need batch_descriptor + slot_mapping=None, + ): + # decide to update draft graph params + _EXTRA_CTX.is_draft_model = True + + # decide to run `prefill` graph or `decodes` graph + _EXTRA_CTX.is_draft_model_prefill = self.is_draft_model_prefill + + forward_context = get_forward_context() + update_full_graph_params( + # FIXME(Ronald1995): support hybrid attn backend + list(self.speculator.attn_backends.values())[0], + self.speculator.update_stream, + forward_context, + num_tokens, + self.vllm_config, + self.speculator.speculative_config, + positions.shape[0], + draft_attn_metadatas=draft_attn_metadatas, + ) + return ret + +else: + + class EagleAclGraphManager(EagleCudaGraphManager): + """AclGraphManager for Eagle speculative decoding.""" + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + cudagraph_mode: CUDAGraphMode, + decode_query_len: int, + speculator: Any, + ): + super().__init__(vllm_config, device, cudagraph_mode, decode_query_len) + + # set speculator attribute, so we can access attributes speculator + # when call `run_fullgraph` method in CudaGraphManager, + # then we don't need to # copy `propose` method in `AscendEagleSpeculator` class. + self.speculator = speculator + # capture_sizes sorts in ascending order. + self.capture_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) + # vllm-ascend need to update draft graph params of attention backend. + # so we need to set draft graph params before capture full graph. + # `prefill` graph and `decodes` graph are different, `decode_query_len` can be used to distinguish them + self.is_draft_model_prefill = decode_query_len > 1 + if super().needs_capture(): + if self.is_draft_model_prefill: + set_draft_graph_prefill_params(self.capture_sizes) + else: + set_draft_graph_params(self.capture_sizes) + + def capture( + self, + forward_fn: Callable, + model_state: ModelState, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_groups: list[list[AttentionGroup]], + kv_cache_config: KVCacheConfig, + progress_bar_desc: str = "Capturing CUDA graphs", + ) -> None: + """Capture ACL graphs for Eagle.""" + with communicator_switch(), model_capture_wrapper(self.speculator, self.is_draft_model_prefill): + super().capture( + forward_fn, + model_state, + input_buffers, + block_tables, + attn_groups, + kv_cache_config, + progress_bar_desc, + ) + + def run_fullgraph( + self, desc: BatchExecutionDescriptor + ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]: + """Override run_fullgraph to update full graph params in run_fullgraph.""" + num_tokens = desc.num_tokens + if self.is_draft_model_prefill: + logger.info_once(f"draft prefill run_fullgraph with num_tokens={num_tokens}") + else: + logger.info_once(f"draft run_fullgraph with num_tokens={num_tokens}") + + draft_attn_metadatas = self.speculator.build_draft_attn_metadatas( + desc.num_reqs, self.is_draft_model_prefill ) - return ret + + ret = super().run_fullgraph(desc) + + positions = self.speculator.input_buffers.positions[:num_tokens] + # refer to vllm.v1.worker.gpu.dp_utils.sync_cudagraph_and_dp_padding to + # calculate num_tokens_across_dp. + num_tokens_across_dp = torch.full([self.speculator.dp_size], num_tokens, device=self.device) + with set_forward_context( + self.speculator.model_state.attn_metadata, + self.vllm_config, + num_tokens=num_tokens, + cudagraph_runtime_mode=desc.cg_mode, + num_tokens_across_dp=num_tokens_across_dp, + batch_descriptor=None, # Full graph model don't need batch_descriptor + slot_mapping=None, + ): + # decide to update draft graph params + _EXTRA_CTX.is_draft_model = True + + # decide to run `prefill` graph or `decodes` graph + _EXTRA_CTX.is_draft_model_prefill = self.is_draft_model_prefill + + forward_context = get_forward_context() + update_full_graph_params( + # FIXME(Ronald1995): support hybrid attn backend + list(self.speculator.attn_backends.values())[0], + self.speculator.update_stream, + forward_context, + num_tokens, + self.vllm_config, + self.speculator.speculative_config, + positions.shape[0], + draft_attn_metadatas=draft_attn_metadatas, + ) + return ret @contextmanager diff --git a/vllm_ascend/worker/v2/spec_decode/eagle/speculator.py b/vllm_ascend/worker/v2/spec_decode/eagle/speculator.py index 1f0fc0c7798..044d8a88743 100644 --- a/vllm_ascend/worker/v2/spec_decode/eagle/speculator.py +++ b/vllm_ascend/worker/v2/spec_decode/eagle/speculator.py @@ -31,13 +31,22 @@ from vllm.v1.worker.gpu.input_batch import InputBatch from vllm.v1.worker.gpu.model_states.interface import ModelState from vllm.v1.worker.gpu.spec_decode.eagle import speculator as vllm_speculator -from vllm.v1.worker.gpu.spec_decode.eagle.cudagraph import EagleCudaGraphManager -from vllm.v1.worker.gpu.spec_decode.eagle.speculator import EagleSpeculator, gumbel_sample, update_eagle_inputs from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.utils import vllm_version_is from vllm_ascend.worker.v2.attn_utils import build_attn_metadata from vllm_ascend.worker.v2.input_batch import AscendInputBuffers -from vllm_ascend.worker.v2.spec_decode.eagle.aclgraph import EagleAclGraphManager + +if not vllm_version_is("0.20.1"): + from vllm.v1.worker.gpu.spec_decode.eagle.cudagraph import PrefillEagleCudaGraphManager + from vllm.v1.worker.gpu.spec_decode.eagle.speculator import EagleSpeculator, update_eagle_draft_inputs + + from vllm_ascend.worker.v2.spec_decode.eagle.aclgraph import PrefillEagleAclGraphManager +else: + from vllm.v1.worker.gpu.spec_decode.eagle.cudagraph import EagleCudaGraphManager + from vllm.v1.worker.gpu.spec_decode.eagle.speculator import EagleSpeculator, gumbel_sample, update_eagle_inputs + + from vllm_ascend.worker.v2.spec_decode.eagle.aclgraph import EagleAclGraphManager class AscendEagleSpeculator(EagleSpeculator): @@ -159,28 +168,26 @@ def set_attn( self.attn_backends = attn_backends - def generate_draft( - self, - num_reqs: int, - num_tokens_padded: int, - attn_metadata: dict[str, Any], - slot_mappings: dict[str, torch.Tensor], - num_tokens_across_dp: torch.Tensor | None, - cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, - ): - """Override GPU EagleSpeculator.generate_draft for Ascend NPUs, because - attn_metadata is created in super propose method, it does not have some - attribute that Ascend attention backend needs, so we update it. - """ - self._init_decode_attn_metadata(attn_metadata, num_reqs) - self._increment_decode_attn_metadata(attn_metadata) - - # NOTE(drslark): following lines (from 145 to 184) come from raw gpu's generate_draft logic - pos = self.input_buffers.positions[:num_reqs] - query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1] - idx_mapping = self.idx_mapping[:num_reqs] - for step in range(1, self.num_speculative_steps): - # Run the eagle model. + if not vllm_version_is("0.20.1"): + + def generate_draft( + self, + num_reqs: int, + num_tokens_padded: int, + attn_metadata: dict[str, Any], + slot_mappings: dict[str, torch.Tensor], + num_tokens_across_dp: torch.Tensor | None, + cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, + ): + """Override GPU EagleSpeculator.generate_draft for Ascend NPUs, because + attn_metadata is created in super propose method, it does not have some + attribute that Ascend attention backend needs, so we update it. + """ + self._init_decode_attn_metadata(attn_metadata, num_reqs) + self._increment_decode_attn_metadata(attn_metadata) + idx_mapping = self.idx_mapping[:num_reqs] + positions = self.input_buffers.positions[:num_reqs] + # Run the eagle model forward pass. last_hidden_states, hidden_states = self.run_model( num_tokens_padded, attn_metadata, @@ -189,36 +196,93 @@ def generate_draft( cudagraph_runtime_mode, ) last_hidden_states = last_hidden_states[:num_reqs] - hidden_states = hidden_states[:num_reqs] - logits = self.model.compute_logits(last_hidden_states) - # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise - # used for draft and target sampling. - draft_tokens = gumbel_sample( + # Sample the draft tokens. + logits = self.model.compute_logits(last_hidden_states) + draft_tokens = self._sample_draft( logits, idx_mapping, - self.temperature, - self.seeds, - pos + 1, - apply_temperature=True, - processed_logits_out=self.draft_logits[:, step] if self.draft_logits is not None else None, + positions, + self.current_draft_step, + self.draft_logits, ) - self.draft_tokens[:num_reqs, step] = draft_tokens - - if step < self.num_speculative_steps - 1: - # Update the inputs for the next step. - update_eagle_inputs( - draft_tokens, - hidden_states, - self.input_buffers, - self.hidden_states, - self.max_model_len, - ) - if attn_metadata is not None: - self.block_tables.compute_slot_mappings(idx_mapping, query_start_loc, pos, num_tokens_padded) - # npu's own update logic - self._increment_decode_attn_metadata(attn_metadata) + # Update the inputs for the next step. + update_eagle_draft_inputs( + draft_tokens, + self.current_draft_step, + hidden_states, + self.draft_tokens, + self.hidden_states, + self.input_buffers, + num_reqs, + self.max_model_len, + self.num_speculative_steps, + ) + # npu's own update logic + self._increment_decode_attn_metadata(attn_metadata) + else: + + def generate_draft( + self, + num_reqs: int, + num_tokens_padded: int, + attn_metadata: dict[str, Any], + slot_mappings: dict[str, torch.Tensor], + num_tokens_across_dp: torch.Tensor | None, + cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, + ): + """Override GPU EagleSpeculator.generate_draft for Ascend NPUs, because + attn_metadata is created in super propose method, it does not have some + attribute that Ascend attention backend needs, so we update it. + """ + self._init_decode_attn_metadata(attn_metadata, num_reqs) + self._increment_decode_attn_metadata(attn_metadata) + + # NOTE(drslark): following lines (from 145 to 184) come from raw gpu's generate_draft logic + pos = self.input_buffers.positions[:num_reqs] + query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1] + idx_mapping = self.idx_mapping[:num_reqs] + for step in range(1, self.num_speculative_steps): + # Run the eagle model. + last_hidden_states, hidden_states = self.run_model( + num_tokens_padded, + attn_metadata, + slot_mappings, + num_tokens_across_dp, + cudagraph_runtime_mode, + ) + last_hidden_states = last_hidden_states[:num_reqs] + hidden_states = hidden_states[:num_reqs] + logits = self.model.compute_logits(last_hidden_states) + + # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise + # used for draft and target sampling. + draft_tokens = gumbel_sample( + logits, + idx_mapping, + self.temperature, + self.seeds, + pos + 1, + apply_temperature=True, + processed_logits_out=self.draft_logits[:, step] if self.draft_logits is not None else None, + ) + self.draft_tokens[:num_reqs, step] = draft_tokens + + if step < self.num_speculative_steps - 1: + # Update the inputs for the next step. + update_eagle_inputs( + draft_tokens, + hidden_states, + self.input_buffers, + self.hidden_states, + self.max_model_len, + ) + if attn_metadata is not None: + self.block_tables.compute_slot_mappings(idx_mapping, query_start_loc, pos, num_tokens_padded) + + # npu's own update logic + self._increment_decode_attn_metadata(attn_metadata) @torch.inference_mode() def run_model( @@ -382,16 +446,37 @@ def torch_gather_wrapper(): torch.gather = original_gather -@contextmanager -def graph_manager_wrapper(speculator): - """Context manager to override graph manager.""" - original_graph_manager = EagleCudaGraphManager - - def factory(vllm_config: VllmConfig, device: torch.device, cudagraph_mode: CUDAGraphMode, decode_query_len: int): - return EagleAclGraphManager(vllm_config, device, cudagraph_mode, decode_query_len, speculator) - - try: - vllm_speculator.EagleCudaGraphManager = factory - yield - finally: - vllm_speculator.EagleCudaGraphManager = original_graph_manager +if not vllm_version_is("0.20.1"): + + @contextmanager + def graph_manager_wrapper(speculator): + """Context manager to override graph manager.""" + original_graph_manager = PrefillEagleCudaGraphManager + + def factory( + vllm_config: VllmConfig, device: torch.device, cudagraph_mode: CUDAGraphMode, decode_query_len: int + ): + return PrefillEagleAclGraphManager(vllm_config, device, cudagraph_mode, decode_query_len, speculator) + + try: + vllm_speculator.PrefillEagleCudaGraphManager = factory + yield + finally: + vllm_speculator.PrefillEagleCudaGraphManager = original_graph_manager +else: + + @contextmanager + def graph_manager_wrapper(speculator): + """Context manager to override graph manager.""" + original_graph_manager = EagleCudaGraphManager + + def factory( + vllm_config: VllmConfig, device: torch.device, cudagraph_mode: CUDAGraphMode, decode_query_len: int + ): + return EagleAclGraphManager(vllm_config, device, cudagraph_mode, decode_query_len, speculator) + + try: + vllm_speculator.EagleCudaGraphManager = factory + yield + finally: + vllm_speculator.EagleCudaGraphManager = original_graph_manager diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 4a2f84ad798..722ec077021 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -44,10 +44,7 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput from vllm.v1.worker.gpu_worker import AsyncIntermediateTensors -from vllm.v1.worker.worker_base import ( - CompilationTimes, # noqa: E402 - WorkerBase, -) +from vllm.v1.worker.worker_base import CompilationTimes, WorkerBase from vllm.v1.worker.workspace import init_workspace_manager import vllm_ascend.envs as envs_ascend @@ -473,7 +470,7 @@ def load_model(self) -> None: with context, set_current_vllm_config(self.vllm_config): self.model_runner.load_model() - def compile_or_warm_up_model(self): + def compile_or_warm_up_model(self) -> CompilationTimes: # Note: need to adapt for graph mode. warmup_sizes = (self.vllm_config.compilation_config.compile_sizes or []).copy() if not self.model_config.enforce_eager: @@ -553,10 +550,15 @@ def compile_or_warm_up_model(self): # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) - return CompilationTimes( language_model=self.vllm_config.compilation_config.compilation_time, - encoder=self.compilation_config.encoder_compilation_time, + # `encoder_compilation_time` was added after v0.19.1 (vLLM #39240); fall + # back to 0.0 so the older release still constructs CompilationTimes. + encoder=getattr( + self.vllm_config.compilation_config, + "encoder_compilation_time", + 0.0, + ), ) def _warm_up_atb(self):