diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 9ef5650e24b..ef7c621acf2 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -27,7 +27,7 @@ on: continue_on_error: required: false type: boolean - default: false + default: true # The following inputs are used by comment-triggered E2E tests (/e2e ). # They carry space-separated pytest paths, categorized by runner type. # Leave empty (default) when running label-triggered full/light suites. diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 879bc4efe44..fd84237ef88 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -80,7 +80,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm_version: [132765e3560659ff63ebd236203672e991b70e08] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 2f6678c126c..70927d83445 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: d886c26d4d4fef7d079696beb4ece1cfb4b008a8 + vllm: 132765e3560659ff63ebd236203672e991b70e08 changes: runs-on: linux-aarch64-a2b3-0 container: @@ -154,7 +154,7 @@ jobs: if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }} strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm_version: [132765e3560659ff63ebd236203672e991b70e08] uses: ./.github/workflows/_optional_smart_e2e.yaml with: vllm: ${{ matrix.vllm_version }} @@ -164,7 +164,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8] + vllm_version: [132765e3560659ff63ebd236203672e991b70e08] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. 
diff --git a/.github/workflows/scripts/config.yaml b/.github/workflows/scripts/config.yaml index 84ac12f9107..93fe09f81e7 100644 --- a/.github/workflows/scripts/config.yaml +++ b/.github/workflows/scripts/config.yaml @@ -31,8 +31,11 @@ e2e-singlecard: estimated_time: 222 - name: tests/e2e/singlecard/test_qwen3_multi_loras.py estimated_time: 100 -- name: tests/e2e/singlecard/test_models.py - estimated_time: 315 +- name: tests/e2e/singlecard/test_models.py::test_minicpm + estimated_time: 158 +- name: tests/e2e/singlecard/test_models.py::test_whisper + estimated_time: 157 + is_skipped: true - name: tests/e2e/singlecard/test_multistream_overlap_shared_expert.py estimated_time: 253 - name: tests/e2e/singlecard/test_quantization.py @@ -110,6 +113,7 @@ e2e-multicard-2-cards: estimated_time: 178 - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2 estimated_time: 127 + is_skipped: true - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_moe_fc2_tp2 estimated_time: 149 - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2 @@ -128,8 +132,17 @@ e2e-multicard-2-cards: estimated_time: 400 - name: tests/e2e/multicard/2-cards/test_quantization.py estimated_time: 482 -- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py - estimated_time: 974 +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep + estimated_time: 195 +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_w8a8_distributed_tp2 + estimated_time: 195 +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_aiv_tp2 + estimated_time: 195 +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_tp2_ep2_mrv2 + estimated_time: 195 + is_skipped: true +- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb + estimated_time: 194 - name: tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py estimated_time: 193 - name: tests/e2e/multicard/2-cards/test_single_request_aclgraph.py @@ -149,12 +162,35 @@ e2e-multicard-4-cards: estimated_time: 322 - name: tests/e2e/multicard/4-cards/test_kimi_k2.py estimated_time: 37 -- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py - estimated_time: 1287 +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_models_long_sequence_output_between_tp_and_cp + estimated_time: 257 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_dcp_only_graph + estimated_time: 257 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_dcp_only_eager + estimated_time: 257 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_pcp_only + estimated_time: 257 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_models_long_sequence_cp_kv_interleave_size_output_between_tp_and_cp + estimated_time: 259 - name: tests/e2e/multicard/4-cards/long_sequence/test_basic.py estimated_time: 2179 -- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py - estimated_time: 1173 +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_mixed_length_prompts_including_1_token + estimated_time: 235 +- name: 
tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_empty_kvcache + estimated_time: 235 +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_basic + estimated_time: 235 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_piecewise + estimated_time: 235 + is_skipped: true +- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_full_graph + estimated_time: 233 + is_skipped: true - name: tests/e2e/multicard/4-cards/long_sequence/test_prefix_caching_cp.py estimated_time: 850 - name: tests/e2e/multicard/4-cards/long_sequence/test_mtp.py diff --git a/docs/source/conf.py b/docs/source/conf.py index 5020f4312ac..536ee61a2a7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -81,7 +81,7 @@ # CANN image tag "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11", # vLLM commit hash for main branch - "main_vllm_commit": "d886c26d4d4fef7d079696beb4ece1cfb4b008a8", + "main_vllm_commit": "132765e3560659ff63ebd236203672e991b70e08", # vLLM tag for main branch "main_vllm_tag": "v0.19.1", # Python version for main branch diff --git a/tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py b/tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py index 545dfe0d65d..2a97cf622c5 100644 --- a/tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py +++ b/tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py @@ -20,7 +20,6 @@ from vllm_ascend._310p.fused_moe.fused_moe import ( AscendFusedMoE310, - AscendSharedFusedMoE310, ) @@ -48,8 +47,8 @@ def forward(self, hidden_states: torch.Tensor): return out -def _build_layer(shared_experts: torch.nn.Module | None) -> AscendSharedFusedMoE310: - layer = AscendSharedFusedMoE310.__new__(AscendSharedFusedMoE310) +def _build_layer(shared_experts: torch.nn.Module | None) -> AscendFusedMoE310: + layer = AscendFusedMoE310.__new__(AscendFusedMoE310) # The test bypasses full layer init with __new__, so we must initialize # nn.Module internals before assigning child modules. 
torch.nn.Module.__init__(layer) @@ -80,7 +79,7 @@ def test_forward_impl_with_shared_experts_returns_tuple_310(): routed_out = torch.randn(3, 8) with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out): - shared_out, routed = layer.forward_impl(hidden_states, router_logits) + shared_out, routed = layer.shared_forward_impl(hidden_states, router_logits) expected_shared = 0.5 * (hidden_states * 2.0 + 1.0) torch.testing.assert_close(shared_out, expected_shared) @@ -100,7 +99,7 @@ def test_forward_impl_without_shared_experts_returns_routed_only_310(): routed_out = torch.randn(3, 8) with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out): - output = layer.forward_impl(hidden_states, router_logits) + output = layer.shared_forward_impl(hidden_states, router_logits) torch.testing.assert_close(output, routed_out) diff --git a/tests/ut/ops/test_fused_moe.py b/tests/ut/ops/test_fused_moe.py index f21d4dc82f8..fcb87262da0 100644 --- a/tests/ut/ops/test_fused_moe.py +++ b/tests/ut/ops/test_fused_moe.py @@ -236,6 +236,12 @@ def moe_method(mock_dist_env): return AscendUnquantizedFusedMoEMethod(moe) +def test_ascend_unquantized_skips_upstream_modular_kernel_init(): + method = AscendUnquantizedFusedMoEMethod.maybe_make_prepare_finalize + + assert method(object()) is None + + class Device(TypedDict): device_id: int device_expert: list[int] diff --git a/tests/ut/spec_decode/test_eagle_proposer.py b/tests/ut/spec_decode/test_eagle_proposer.py index 6a808e9154b..64bdb5a0830 100644 --- a/tests/ut/spec_decode/test_eagle_proposer.py +++ b/tests/ut/spec_decode/test_eagle_proposer.py @@ -17,6 +17,17 @@ from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.spec_decode.draft_proposer import AscendDraftModelProposer from vllm_ascend.spec_decode.eagle_proposer import AscendEagleProposer +from vllm_ascend.utils import vllm_version_is + +# vLLM #40732 moved `SpecDecodeBaseProposer` (and its `CpuGpuBuffer` import) +# out of `vllm.v1.spec_decode.eagle` into `vllm.v1.spec_decode.llm_base_proposer`. +# Pick the right patch path depending on the installed vllm version so the +# tests can mock the buffer factory. 
+_CPU_GPU_BUFFER_TARGET = ( + "vllm.v1.spec_decode.eagle.CpuGpuBuffer" + if vllm_version_is("0.19.1") + else "vllm.v1.spec_decode.llm_base_proposer.CpuGpuBuffer" +) class TestEagleProposerInitialization(TestBase): @@ -51,13 +62,15 @@ def setUp(self): self.vllm_config.parallel_config.enable_expert_parallel = False self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.num_speculative_tokens = 2 + self.vllm_config.speculative_config.parallel_drafting = False self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(2)]) + self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[]) self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0 self.vllm_config.speculative_config.draft_model_config.uses_mrope = False self.vllm_config.speculative_config.disable_padded_drafter_batch = False self.vllm_config.additional_config = None - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -76,6 +89,7 @@ def tearDown(self): def test_initialization_eagle_graph(self): self.vllm_config.speculative_config.method = "eagle" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096 + self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4096 self.vllm_config.speculative_config.draft_model_config.uses_mrope = False self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE self.vllm_config.model_config.enforce_eager = False @@ -99,6 +113,7 @@ def test_initialization_eagle_graph(self): def test_initialization_eagle3_enforce_eager(self): self.vllm_config.speculative_config.method = "eagle3" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048 + self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048 self.vllm_config.compilation_config.mode = CompilationMode.NONE self.vllm_config.compilation_config.pass_config = MagicMock() self.vllm_config.compilation_config.pass_config.enable_sp = False @@ -116,6 +131,7 @@ def test_initialization_eagle3_enforce_eager(self): def test_initialization_eagle3_full_graph_async(self): self.vllm_config.speculative_config.method = "eagle3" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048 + self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048 self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE self.vllm_config.model_config.enforce_eager = False self.vllm_config.speculative_config.enforce_eager = False @@ -133,6 +149,7 @@ def test_initialization_eagle3_full_graph_async(self): def test_initialization_mtp_full_graph_async(self): self.vllm_config.speculative_config.method = "mtp" self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048 + self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048 self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE self.vllm_config.model_config.enforce_eager = False self.vllm_config.speculative_config.enforce_eager = False @@ -196,7 +213,7 @@ def setUp(self): self.vllm_config.additional_config = None 
init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -332,7 +349,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -483,7 +500,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -558,7 +575,7 @@ def setUp_and_tearDown(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer") + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -1263,7 +1280,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer) + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -1747,6 +1764,7 @@ def setUp(self): self.vllm_config.speculative_config.use_local_argmax_reduction = False self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(3)]) + self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[]) self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4 self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4 self.vllm_config.speculative_config.draft_model_config.uses_mrope = False @@ -1755,7 +1773,7 @@ def setUp(self): self.vllm_config.additional_config = None init_ascend_config(self.vllm_config) - self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer) + self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer) self.mock_cpugpubuffer.start() self.mock_supports_multimodal_inputs = patch( "vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False @@ -1876,7 +1894,14 @@ def check_mock(self): import vllm.v1.spec_decode.eagle - assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer") + # `CpuGpuBuffer` was re-exported from `eagle` until vLLM #40732 moved + # `SpecDecodeBaseProposer` (and the import) into `llm_base_proposer`. 
+ if vllm_version_is("0.19.1"): + assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer") + else: + import vllm.v1.spec_decode.llm_base_proposer + + assert hasattr(vllm.v1.spec_decode.llm_base_proposer, "CpuGpuBuffer") RunnerCls = vllm.v1.spec_decode.eagle.SpecDecodeBaseProposer for attr in ("_get_positions", "_set_positions"): assert hasattr(RunnerCls, attr), f"SpecDecodeBaseProposer.{attr} not found" diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py index 7c81d6a7336..5b06e9138bb 100644 --- a/vllm_ascend/_310p/fused_moe/fused_moe.py +++ b/vllm_ascend/_310p/fused_moe/fused_moe.py @@ -20,7 +20,6 @@ from vllm.distributed import get_dp_group, get_ep_group, get_tp_group from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute @@ -40,6 +39,11 @@ def __init__(self, moe: FusedMoEConfig = None): def is_monolithic(self) -> bool: return False + def maybe_make_prepare_finalize(self, routing_tables=None): + # Ascend 310P uses its own MoE communication and forward_impl path. + # Do not let upstream modular-kernel initialization replace it. + return None + def process_weights_after_loading(self, layer): super().process_weights_after_loading(layer) @@ -119,6 +123,8 @@ class AscendFusedMoE310(FusedMoE): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self._routed_input_transform = kwargs.get("routed_input_transform") + self._shared_experts = kwargs.get("shared_experts") self.global_num_experts = kwargs["num_experts"] if self.quant_config is None: @@ -127,6 +133,10 @@ def __init__(self, *args, **kwargs): self.quant_method = self.quant_config.get_quant_method(self, self.layer_name) assert self.quant_method is not None + # Keep base_quant_method aligned with the Ascend-replaced quant_method + # so FusedMoE.maybe_init_modular_kernel doesn't dispatch into the + # upstream UnquantizedFusedMoEMethod.maybe_make_prepare_finalize. + self.base_quant_method = self.quant_method self.moe_config.tp_group = get_tp_group() self.moe_config.dp_group = get_dp_group() @@ -175,6 +185,11 @@ def __init__(self, *args, **kwargs): self.vllm_config.parallel_config.enable_dbo, ) + @property + def is_internal_router(self) -> bool: + # 310P Ascend path expects router logits from the model forward path. + return False + def init_experts_map(self, moe_config): """ Initialize expert mapping for MoE (Mixture of Experts) model. @@ -260,68 +275,12 @@ def forward_impl( # type: ignore[override] return routed_out - -class AscendSharedFusedMoE310(SharedFusedMoE, AscendFusedMoE310): - def __init__( - self, - shared_experts: torch.nn.Module, - gate: torch.nn.Module | None = None, - use_overlapped: bool = True, - routed_input_transform: torch.nn.Module | None = None, - **kwargs, - ): - AscendFusedMoE310.__init__(self, **kwargs) - self._routed_input_transform = routed_input_transform - self._shared_experts = shared_experts - self.use_overlapped = use_overlapped - self.shared_expert_stream = None - self._gate = gate - # Recreate runner after shared_experts/gate are set so custom op dispatch - # goes through moe_forward_shared. 
- # NOTE: must use self._shared_experts here, not self.shared_experts — - # FusedMoE.shared_experts is a property that reads self.runner.shared_experts, - # which at this point is still the stale runner built with shared_experts=None. - from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner - - self.runner = AscendMoERunner( - self.layer_name, - self.moe_config, - self.router, - self._routed_input_transform, - self._gate, - self._shared_experts, - self.quant_method, - self.reduce_results, - self.vllm_config.parallel_config.enable_dbo, - ) - - @property - def is_internal_router(self) -> bool: - # 310P Ascend path expects router logits from the model forward path. - return False - - def forward( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - result = AscendFusedMoE310.forward( - self, - hidden_states=hidden_states, - router_logits=router_logits, - ) - # When shared experts are absent, the parent returns only fused_out; - # otherwise it returns a (shared_out, fused_out) tuple. - if self._shared_experts is None: - return None, result - return result - def _forward_shared_experts(self, hidden_states: torch.Tensor): if self._shared_experts is None: return None return self._shared_experts(hidden_states) - def forward_impl( # type: ignore[override] + def shared_forward_impl( # type: ignore[override] self, hidden_states: torch.Tensor, router_logits: torch.Tensor ): routed_out = AscendFusedMoE310.forward_impl( diff --git a/vllm_ascend/__init__.py b/vllm_ascend/__init__.py index c5b7d1b0445..e90b796b4e1 100644 --- a/vllm_ascend/__init__.py +++ b/vllm_ascend/__init__.py @@ -15,6 +15,25 @@ # This file is a part of the vllm-ascend project. # +_GLOBAL_PATCH_APPLIED = False + + +def _ensure_global_patch(): + """Apply process-wide vLLM patches before engine-core initialization. + + vLLM loads general plugins in engine-core subprocesses. E2E test + conftest hooks do not run there, so global patches that affect scheduler + and engine code must also be applied through these plugin entry points. + """ + global _GLOBAL_PATCH_APPLIED + if _GLOBAL_PATCH_APPLIED: + return + + from vllm_ascend.utils import adapt_patch + + adapt_patch(is_global_patch=True) + _GLOBAL_PATCH_APPLIED = True + def register(): """Register the NPU platform.""" @@ -23,12 +42,16 @@ def register(): def register_connector(): + _ensure_global_patch() + from vllm_ascend.distributed.kv_transfer import register_connector register_connector() def register_model_loader(): + _ensure_global_patch() + from .model_loader.netloader import register_netloader from .model_loader.rfork import register_rforkloader @@ -37,6 +60,8 @@ def register_model_loader(): def register_service_profiling(): + _ensure_global_patch() + from .profiling_config import generate_service_profiling_config generate_service_profiling_config() diff --git a/vllm_ascend/core/scheduler_profiling_chunk.py b/vllm_ascend/core/scheduler_profiling_chunk.py index df9766e20ca..9f380def08c 100644 --- a/vllm_ascend/core/scheduler_profiling_chunk.py +++ b/vllm_ascend/core/scheduler_profiling_chunk.py @@ -59,19 +59,34 @@ def __init__( kv_cache_config: KVCacheConfig, structured_output_manager: StructuredOutputManager, block_size: int, + # `hash_block_size` was added in vLLM #40946; keep it optional so the + # subclass works on both pinned vllm and main. 
+ hash_block_size: int | None = None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, include_finished_set: bool = False, log_stats: bool = False, ) -> None: - super().__init__( - vllm_config, - kv_cache_config, - structured_output_manager, - block_size, - mm_registry=mm_registry, - include_finished_set=include_finished_set, - log_stats=log_stats, - ) + if vllm_version_is("0.19.1"): + super().__init__( + vllm_config, + kv_cache_config, + structured_output_manager, + block_size, + mm_registry=mm_registry, + include_finished_set=include_finished_set, + log_stats=log_stats, + ) + else: + super().__init__( + vllm_config, + kv_cache_config, + structured_output_manager, + block_size, + hash_block_size=hash_block_size, + mm_registry=mm_registry, + include_finished_set=include_finished_set, + log_stats=log_stats, + ) from vllm_ascend.ascend_config import get_ascend_config, init_ascend_config diff --git a/vllm_ascend/lora/utils.py b/vllm_ascend/lora/utils.py index d822b362ee4..5f3ab650f54 100755 --- a/vllm_ascend/lora/utils.py +++ b/vllm_ascend/lora/utils.py @@ -26,6 +26,7 @@ AscendRowParallelLinear, ) from vllm_ascend.ops.vocab_parallel_embedding import AscendVocabParallelEmbedding +from vllm_ascend.utils import vllm_version_is class AscendColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): @@ -185,15 +186,27 @@ def can_replace_layer( def refresh_all_lora_classes(): - vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedColumnParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedColumnParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendReplicatedLinearWithLoRA) + ascend_classes = ( + AscendColumnParallelLinearWithLoRA, + AscendMergedColumnParallelLinearWithLoRA, + AscendRowParallelLinearWithLoRA, + AscendVocabParallelEmbeddingWithLoRA, + AscendQKVParallelLinearWithLoRA, + AscendMergedQKVParallelLinearWithLoRA, + AscendColumnParallelLinearWithShardedLoRA, + AscendMergedColumnParallelLinearWithShardedLoRA, + AscendMergedQKVParallelLinearWithShardedLoRA, + AscendQKVParallelLinearWithShardedLoRA, + AscendRowParallelLinearWithShardedLoRA, + AscendReplicatedLinearWithLoRA, + ) + if vllm_version_is("0.19.1"): + for cls in ascend_classes: + vllm.lora.utils._all_lora_classes.add(cls) + else: + # vLLM #35077 changed _all_lora_classes from set to ordered tuple. + # Append the Ascend classes in a deterministic order. 
+ vllm.lora.utils._all_lora_classes = ( + *vllm.lora.utils._all_lora_classes, + *ascend_classes, + ) diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index f9f993fd077..373421e4523 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -21,7 +21,6 @@ import torch import torch.nn.functional as F import torch_npu -from vllm._aiter_ops import rocm_aiter_ops from vllm.config import get_current_vllm_config from vllm.distributed import get_dp_group, get_ep_group, get_tp_group, tensor_model_parallel_all_reduce from vllm.forward_context import get_forward_context @@ -29,8 +28,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer -from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner # type: ignore import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config @@ -84,6 +83,11 @@ def __init__(self, moe: FusedMoEConfig = None): def is_monolithic(self) -> bool: return False + def maybe_make_prepare_finalize(self, routing_tables=None): + # Ascend uses its own MoE communication and forward_impl path. + # Do not let upstream modular-kernel initialization replace it.
+ return None + def process_weights_after_loading(self, layer): super(UnquantizedFusedMoEMethod, self).process_weights_after_loading(layer) @@ -229,8 +233,7 @@ def apply( return final_hidden_states -# Please remove this inheritance after extending vllm, todo(wxs) -class AscendMoERunner(DefaultMoERunner): +class AscendMoERunner(MoERunner): @property def use_dp_chunking(self) -> bool: """Ascend uses its own forward_impl path, not the FlashInfer Cutlass @@ -256,12 +259,13 @@ def forward_impl( # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared return result - def forward_dispatch( + def _forward_impl( self, layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, + input_ids: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: with self._sequence_parallel_context(): return self.forward_impl( @@ -278,7 +282,11 @@ class AscendFusedMoE(FusedMoE): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - + self.use_overlapped = True + self._routed_input_transform = kwargs.get("routed_input_transform") + self._shared_experts = kwargs.get("shared_experts") + self.shared_expert_stream = None + has_shared_experts = self._shared_experts is not None num_experts = kwargs["num_experts"] intermediate_size = kwargs["intermediate_size"] num_shared_experts = kwargs.get("n_shared_experts", 0) @@ -295,6 +303,12 @@ def __init__(self, *args, **kwargs): self.quant_method = self.quant_config.get_quant_method(self, self.layer_name) assert self.quant_method is not None + # Keep base_quant_method in sync with the swapped-in Ascend method, + # otherwise FusedMoE.maybe_init_modular_kernel (called via the V2 + # model runner's prepare_communication_buffer_for_model) would dispatch + # to the upstream UnquantizedFusedMoEMethod.maybe_make_prepare_finalize, + # which raises by design. 
+ self.base_quant_method = self.quant_method self.moe_config.tp_group = get_tp_group() self.moe_config.dp_group = get_dp_group() @@ -302,6 +316,11 @@ def __init__(self, *args, **kwargs): self.moe_config.mc2_group = get_mc2_group() self.moe_config.supports_eplb = self.quant_method.supports_eplb ascend_config = get_ascend_config() + self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert and has_shared_experts + self.shared_multistream_overlap_gate = ascend_config.multistream_overlap_gate and has_shared_experts + if enable_sp() and has_shared_experts: + logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") + # flashcommon3 gate stream self.multistream_overlap_gate = ascend_config.multistream_overlap_gate if self.multistream_overlap_gate and AscendFusedMoE.gate_stream is None: @@ -311,6 +330,7 @@ def __init__(self, *args, **kwargs): self.e_score_correction_bias.data = self.e_score_correction_bias.data.to( dtype=vllm_config.model_config.dtype ) + self._gate = kwargs.get("gate") # init moe eplb_config = ascend_config.eplb_config @@ -371,13 +391,65 @@ def __init__(self, *args, **kwargs): self.moe_config, self.router, self._routed_input_transform, - kwargs.pop("gate", None), - kwargs.pop("shared_experts", None), + self._gate, + self._shared_experts, self.quant_method, - self.reduce_results, self.vllm_config.parallel_config.enable_dbo, ) + if self.multistream_overlap_shared_expert: + # Wrap the quant_method's process_weights_after_loading to validate that + # splitting shared expert computation (gate_up projection + activation, + # then down projection) yields identical results to integrated + # computation after weight loading. + original_process_weights = self.quant_method.process_weights_after_loading + + @wraps(original_process_weights) + def wrapped_process_weights(*args, **kwargs): + result = original_process_weights(*args, **kwargs) + self._validate_shared_expert_consistency() + return result + + self.quant_method.process_weights_after_loading = wrapped_process_weights # type: ignore + + def _validate_shared_expert_consistency(self): + """Validate that split shared expert computation matches integrated + computation.""" + test_input = ( + torch.rand(10, self.hidden_size, device="npu", dtype=self.moe_config.in_dtype) * 2 - 1 + ) # Random input for testing, scoped to [-1, 1] + + assert self._shared_experts is not None + integrated_out = self._shared_experts(test_input) + part1_out = self._shared_experts_part1(test_input) + split_out = self._shared_experts_part2(test_input, part1_out) + + if not torch.allclose(integrated_out, split_out): + diff = (integrated_out - split_out).abs() + logger.error("FusedMoE shared experts split computation does not match the integrated computation.") + logger.error("Max absolute difference: %s", diff.max().item()) + logger.error( + "Integrated output - sum: %s, norm: %s", integrated_out.sum().item(), integrated_out.norm().item() + ) + logger.error("Split output - sum: %s, norm: %s", split_out.sum().item(), split_out.norm().item()) + raise ValueError("FusedMoE shared experts split computation does not match the integrated computation.") + logger.info_once("FusedMoE shared experts split computation matches the integrated computation.") + + def _shared_experts_part1(self, hidden_states: torch.Tensor): + shared_gate_up, _ = self._shared_experts.gate_up_proj(hidden_states) # type: ignore + return shared_gate_up + + def _shared_experts_part2(self, hidden_states: torch.Tensor, 
shared_gate_up: torch.Tensor): + shared_act = self._shared_experts.act_fn(shared_gate_up) # type: ignore + shared_out, _ = self._shared_experts.down_proj(shared_act) # type: ignore + + # Qwen3-Next specific gating mechanism + assert self._shared_experts is not None + if hasattr(self._shared_experts, "expert_gate") and self._shared_experts.expert_gate is not None: + gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore + shared_out = F.sigmoid(gate_out) * shared_out + return shared_out + def _get_quant_type(self) -> QuantType: quant_type = QuantType.NONE method = getattr(self.quant_method, "quant_method", None) @@ -408,6 +480,21 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens """ return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states) + @property + def gate(self) -> torch.nn.Module | None: + return self._gate if self.use_overlapped else None + + @property + def is_internal_router(self) -> bool: + return False + + @property + def use_dp_chunking(self) -> bool: + """This func routes to the chunked forward path using the FlashInfer Cutlass kernel + only when data parallelism (DP) is enabled. Thus just returning False in vllm-ascend + """ + return False + def forward( self, hidden_states: torch.Tensor, @@ -537,9 +624,10 @@ def forward_impl( # type: ignore[override] self.load_counter.add_(1) else: self.moe_load.add_(local_load) + routed_out = _EXTRA_CTX.moe_comm_method.finalize( hidden_states=fused_experts_results.routed_out, - reduce_results=self.reduce_results, + reduce_results=isinstance(_EXTRA_CTX.moe_comm_method, AllGatherCommImpl), padded_hidden_states_shape=padded_hidden_states_shape, ) @@ -553,140 +641,6 @@ def forward_impl( # type: ignore[override] # The vLLM FusedMoE forward_impl does not return events. return routed_out - -class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE): - def __init__( - self, - shared_experts: torch.nn.Module, - gate: torch.nn.Module | None = None, - use_overlapped: bool = True, - routed_input_transform: torch.nn.Module | None = None, - **kwargs, - ): - ascend_config = get_ascend_config() - # TODO: Enabling the mix placement in deepseek_v2.py - # remove this part after the mix placement merged into vllm - # https://github.com/vllm-project/vllm/pull/31256 - if ascend_config.mix_placement: - rocm_aiter_ops.is_fusion_moe_shared_experts_enabled = mock_false - rocm_aiter_ops.is_fused_moe_enabled = mock_false - AscendFusedMoE.__init__(self, **kwargs) - if ascend_config.mix_placement: - rocm_aiter_ops.is_fusion_moe_shared_experts_enabled = mock_true - rocm_aiter_ops.is_fused_moe_enabled = mock_true - - self._routed_input_transform = routed_input_transform - self._shared_experts = shared_experts - self.use_overlapped = use_overlapped - self.shared_expert_stream = None - has_shared_experts = shared_experts is not None - self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert and has_shared_experts - self.multistream_overlap_gate = ascend_config.multistream_overlap_gate and has_shared_experts - if enable_sp(): - logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") - - self._gate = gate - # Recreate the runner with the correct shared_experts parameter. - # The parent class created the runner before self._shared_experts was set. 
- # NOTE: must use self._shared_experts here, not self.shared_experts — - # FusedMoE.shared_experts is a property that reads self.runner.shared_experts, - # which at this point is still the stale runner built with shared_experts=None. - self.runner = AscendMoERunner( - self.layer_name, - self.moe_config, - self.router, - self._routed_input_transform, - self.gate, - self._shared_experts, - self.quant_method, - self.reduce_results, - self.vllm_config.parallel_config.enable_dbo, - ) - - if self.multistream_overlap_shared_expert: - # Wrap the quant_method's process_weights_after_loading to validate that - # splitting shared expert computation (gate_up projection + activation, - # then down projection) yields identical results to integrated - # computation after weight loading. - original_process_weights = self.quant_method.process_weights_after_loading - - @wraps(original_process_weights) - def wrapped_process_weights(*args, **kwargs): - result = original_process_weights(*args, **kwargs) - self._validate_shared_expert_consistency() - return result - - self.quant_method.process_weights_after_loading = wrapped_process_weights # type: ignore - - def _shared_experts_part1(self, hidden_states: torch.Tensor): - shared_gate_up, _ = self._shared_experts.gate_up_proj(hidden_states) # type: ignore - return shared_gate_up - - def _shared_experts_part2(self, hidden_states: torch.Tensor, shared_gate_up: torch.Tensor): - shared_act = self._shared_experts.act_fn(shared_gate_up) # type: ignore - shared_out, _ = self._shared_experts.down_proj(shared_act) # type: ignore - - # Qwen3-Next specific gating mechanism - if hasattr(self._shared_experts, "expert_gate") and self._shared_experts.expert_gate is not None: - gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore - shared_out = F.sigmoid(gate_out) * shared_out - return shared_out - - def _validate_shared_expert_consistency(self): - """Validate that split shared expert computation matches integrated - computation.""" - test_input = ( - torch.rand(10, self.hidden_size, device="npu", dtype=self.moe_config.in_dtype) * 2 - 1 - ) # Random input for testing, scoped to [-1, 1] - - integrated_out = self._shared_experts(test_input) - part1_out = self._shared_experts_part1(test_input) - split_out = self._shared_experts_part2(test_input, part1_out) - - if not torch.allclose(integrated_out, split_out): - diff = (integrated_out - split_out).abs() - logger.error("SharedFusedMoE shared experts split computation does not match the integrated computation.") - logger.error("Max absolute difference: %s", diff.max().item()) - logger.error( - "Integrated output - sum: %s, norm: %s", integrated_out.sum().item(), integrated_out.norm().item() - ) - logger.error("Split output - sum: %s, norm: %s", split_out.sum().item(), split_out.norm().item()) - raise ValueError( - "SharedFusedMoE shared experts split computation does not match the integrated computation." - ) - logger.info_once("SharedFusedMoE shared experts split computation matches the integrated computation.") - - @property - def gate(self) -> torch.nn.Module | None: - return self._gate if self.use_overlapped else None - - @property - def is_internal_router(self) -> bool: - return False - - @property - def use_dp_chunking(self) -> bool: - """This func routes to the chunked forward path using the FlashInfer Cutlass kernel - only when data parallelism (DP) is enabled. 
Thus just returning False in vllm-ascend - """ - return False - - def forward( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - result = AscendFusedMoE.forward( - self, - hidden_states=hidden_states, - router_logits=router_logits, - ) - # When shared experts are absent, the parent returns only fused_out; - # otherwise it returns a (shared_out, fused_out) tuple. - if self._shared_experts is None: - return None, result - return result - def _forward_shared_experts(self, hidden_states: torch.Tensor, fused_moe_evts: FusedMoEEvents): if self._shared_experts is None: return None @@ -722,15 +676,15 @@ def maybe_wait_event(evt: torch.npu.Event | None): shared_out = tensor_model_parallel_all_reduce(shared_out) return shared_out - def forward_impl( # type: ignore[override] + def shared_forward_impl( # type: ignore[override] self, hidden_states: torch.Tensor, router_logits: torch.Tensor ): - if self.multistream_overlap_gate: + if self.shared_multistream_overlap_gate: set_flash_common3_context(shared_experts=self._shared_experts) before_routed_experts = torch.npu.current_stream().record_event() - fused_moe_results = AscendFusedMoE.forward_impl( - self, + + fused_moe_results = self.forward_impl( hidden_states=hidden_states, router_logits=router_logits, return_with_event=True, @@ -740,7 +694,7 @@ def forward_impl( # type: ignore[override] if self._shared_experts is None: return routed_out - if self.multistream_overlap_gate: + if self.shared_multistream_overlap_gate: fc3_context = get_flash_common3_context() assert fc3_context is not None shared_out = fc3_context.shared_out @@ -753,5 +707,4 @@ def forward_impl( # type: ignore[override] before_combine=fused_moe_results.before_combine_evt, ), ) - return shared_out, routed_out diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py index 013076ebc88..2e057efaa3f 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -166,12 +166,24 @@ def __init__( norm_before_gate: bool = False, device: torch.device | None = None, dtype: torch.dtype | None = None, + # `activation` was added in vLLM #40245 (Qwen3-Next/GDN). Accept and + # forward it; older vllm versions did not pass this kwarg so the + # default keeps existing behavior. + activation: str = "swish", ): """If group_size is not None, we do GroupNorm with each group having group_size elements. group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). """ factory_kwargs = {"device": device, "dtype": dtype} - super().__init__(hidden_size, eps, group_size, norm_before_gate, device, dtype) + super().__init__( + hidden_size, + eps, + group_size, + norm_before_gate, + device, + dtype, + activation=activation, + ) self.eps = eps self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) self.register_parameter("bias", None) diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index eb71196018f..b406303c775 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -193,6 +193,25 @@ # Remove this patch once the runtime vLLM version contains the GLM parser # and streaming finish-chunk fixes. # +# ** 10a. File: platform/patch_kv_cache_utils.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. 
`vllm.v1.core.kv_cache_utils.resolve_kv_cache_block_sizes` +# `vllm.v1.engine.core.resolve_kv_cache_block_sizes` +# Why: +# vLLM PR #40860 added a restriction that hybrid KV cache groups with +# multiple block sizes do not support context parallelism (dcp/pcp > 1). +# This restriction is correct for CUDA but not for Ascend, which +# implements context parallelism for MLA and SWA-MLA layers separately. +# How: +# Monkey-patch resolve_kv_cache_block_sizes to handle the multiple-groups +# + CP case by returning lcm(block_sizes) * dcp * pcp as scheduler_block_size +# instead of raising ValueError. +# Related PR (if no, explain why): +# vLLM PR #40860 ([Feat] DeepSeek V4 Rebased). +# Future Plan: +# Remove this patch once upstream vLLM supports hybrid KV cache + CP for +# non-CUDA backends, or exposes a platform hook for this behavior. +# # ** 10. File: platform/patch_kv_cache_interface.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.kv_cache_interface.MLAAttentionSpec` diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index 9a793352e2f..329aadf2d95 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -18,6 +18,7 @@ import vllm_ascend.patch.platform.patch_distributed # noqa import vllm_ascend.patch.platform.patch_kv_cache_interface # noqa +import vllm_ascend.patch.platform.patch_kv_cache_utils # noqa from vllm_ascend import envs from vllm_ascend.utils import is_310p diff --git a/vllm_ascend/patch/platform/patch_kv_cache_utils.py b/vllm_ascend/patch/platform/patch_kv_cache_utils.py new file mode 100644 index 00000000000..20c09004412 --- /dev/null +++ b/vllm_ascend/patch/platform/patch_kv_cache_utils.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM-Ascend project +import math + +import vllm.v1.core.kv_cache_utils +from vllm.config import VllmConfig +from vllm.v1.kv_cache_interface import KVCacheConfig + +_orig_resolve_kv_cache_block_sizes = vllm.v1.core.kv_cache_utils.resolve_kv_cache_block_sizes + + +def _ascend_resolve_kv_cache_block_sizes( + kv_cache_config: KVCacheConfig, + vllm_config: VllmConfig, +) -> tuple[int, int]: + """Ascend-compatible resolve_kv_cache_block_sizes. + + vLLM PR #40860 added a restriction that hybrid KV cache groups with + multiple block sizes do not support context parallelism (dcp/pcp > 1). + This restriction is correct for CUDA but not for Ascend, which implements + context parallelism for MLA and SWA-MLA layers independently. + + For multiple KV cache groups with CP, compute scheduler_block_size as + lcm(group_block_sizes) * dcp * pcp to maintain alignment, consistent + with the pre-PR-#40860 behavior of block_size * dcp * pcp. + """ + cache_config = vllm_config.cache_config + dcp = vllm_config.parallel_config.decode_context_parallel_size + pcp = vllm_config.parallel_config.prefill_context_parallel_size + groups = kv_cache_config.kv_cache_groups + + if len(groups) <= 1: + bs = cache_config.block_size * dcp * pcp + return bs, bs + + if dcp != 1 or pcp != 1: + # Ascend supports CP with multiple KV cache groups; compute + # scheduler_block_size using the LCM of all group block sizes + # multiplied by the CP factors for proper alignment. 
+ group_block_sizes = [g.kv_cache_spec.block_size for g in groups] + scheduler_block_size = math.lcm(*group_block_sizes) * dcp * pcp + return scheduler_block_size, scheduler_block_size + + return _orig_resolve_kv_cache_block_sizes(kv_cache_config, vllm_config) + + +vllm.v1.core.kv_cache_utils.resolve_kv_cache_block_sizes = _ascend_resolve_kv_cache_block_sizes + +# Also patch the reference used by engine/core.py which imports the function directly. +import vllm.v1.engine.core # noqa: E402 + +vllm.v1.engine.core.resolve_kv_cache_block_sizes = _ascend_resolve_kv_cache_block_sizes diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py index 358c294ebe0..26c0bf395ac 100644 --- a/vllm_ascend/sample/rejection_sampler.py +++ b/vllm_ascend/sample/rejection_sampler.py @@ -94,6 +94,8 @@ def rejection_sample( # [batch_size, 1] bonus_token_ids: torch.Tensor, sampling_metadata: SamplingMetadata, + synthetic_mode: bool = False, + synthetic_conditional_rates: torch.Tensor | None = None, ) -> torch.Tensor: assert draft_token_ids.ndim == 1 assert draft_probs is None or draft_probs.ndim == 2 diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index b39a9f27cce..bdf4a9e1945 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -624,7 +624,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul from vllm_ascend.ops.conv import AscendConv3dLayer - from vllm_ascend.ops.fused_moe.fused_moe import AscendFusedMoE, AscendSharedFusedMoE + from vllm_ascend.ops.fused_moe.fused_moe import AscendFusedMoE from vllm_ascend.ops.gdn import AscendGatedDeltaNetAttention from vllm_ascend.ops.layernorm import AscendGemmaRMSNorm, AscendRMSNorm, AscendRMSNormGated from vllm_ascend.ops.linear import ( @@ -670,7 +670,6 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): "RMSNorm": AscendRMSNorm, "GemmaRMSNorm": AscendGemmaRMSNorm, "FusedMoE": AscendFusedMoE, - "SharedFusedMoE": AscendSharedFusedMoE, "MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention, "MMEncoderAttention": AscendMMEncoderAttention, "ApplyRotaryEmb": AscendApplyRotaryEmb, @@ -683,7 +682,7 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): # 310P: override selected ops with 310P implementations (keep minimal changes outside _310p) if is_310p(): - from vllm_ascend._310p.fused_moe.fused_moe import AscendFusedMoE310, AscendSharedFusedMoE310 + from vllm_ascend._310p.fused_moe.fused_moe import AscendFusedMoE310 from vllm_ascend._310p.ops.activation import AscendSiluAndMul310 from vllm_ascend._310p.ops.conv import AscendConv3dLayer310 from vllm_ascend._310p.ops.fla.gdn_310 import AscendGatedDeltaNetAttention310 @@ -707,7 +706,6 @@ def register_ascend_customop(vllm_config: VllmConfig | None = None): "GemmaRMSNorm": AscendGemmaRMSNorm310, "RMSNormGated": AscendRMSNormGated310, "FusedMoE": AscendFusedMoE310, - "SharedFusedMoE": AscendSharedFusedMoE310, "ParallelLMHead": AscendParallelLMHead310, "VocabParallelEmbedding": AscendVocabParallelEmbedding310, "MMEncoderAttention": AscendMMEncoderAttention310, diff --git a/vllm_ascend/worker/v2/model_runner.py b/vllm_ascend/worker/v2/model_runner.py index 1616843a1d0..a2903dca9c4 100644 --- a/vllm_ascend/worker/v2/model_runner.py +++ b/vllm_ascend/worker/v2/model_runner.py @@ -304,6 +304,16 @@ def prepare_inputs( input_ids = self.input_buffers.input_ids[:num_tokens_after_padding] positions = 
self.input_buffers.positions[:num_tokens_after_padding] + # CPU upper bound on seq_lens (num_computed_tokens + num_scheduled_tokens). + # Added by vLLM PR #40654 to avoid GPU->CPU sync for seq_lens. + seq_lens_cpu_upper_bound_np = np.zeros(num_reqs_padded, dtype=np.int32) + np.add( + self.req_states.num_computed_tokens_np[idx_mapping_np], + num_scheduled_tokens, + out=seq_lens_cpu_upper_bound_np[:num_reqs], + ) + seq_lens_cpu_upper_bound = torch.from_numpy(seq_lens_cpu_upper_bound_np) + self.input_batch = AscendInputBatch( req_ids=req_ids, num_reqs=num_reqs, @@ -319,6 +329,7 @@ def prepare_inputs( query_start_loc=query_start_loc, query_start_loc_np=query_start_loc_np, seq_lens=seq_lens, + seq_lens_cpu_upper_bound=seq_lens_cpu_upper_bound, dcp_local_seq_lens=None, # TODO(Ronald1995): support cp. input_ids=input_ids, positions=positions, diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 4a2f84ad798..722ec077021 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -44,10 +44,7 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput from vllm.v1.worker.gpu_worker import AsyncIntermediateTensors -from vllm.v1.worker.worker_base import ( - CompilationTimes, # noqa: E402 - WorkerBase, -) +from vllm.v1.worker.worker_base import CompilationTimes, WorkerBase from vllm.v1.worker.workspace import init_workspace_manager import vllm_ascend.envs as envs_ascend @@ -473,7 +470,7 @@ def load_model(self) -> None: with context, set_current_vllm_config(self.vllm_config): self.model_runner.load_model() - def compile_or_warm_up_model(self): + def compile_or_warm_up_model(self) -> CompilationTimes: # Note: need to adapt for graph mode. warmup_sizes = (self.vllm_config.compilation_config.compile_sizes or []).copy() if not self.model_config.enforce_eager: @@ -553,10 +550,15 @@ def compile_or_warm_up_model(self): # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) - return CompilationTimes( language_model=self.vllm_config.compilation_config.compilation_time, - encoder=self.compilation_config.encoder_compilation_time, + # `encoder_compilation_time` was added after v0.19.1 (vLLM #39240); fall + # back to 0.0 so the older release still constructs CompilationTimes. + encoder=getattr( + self.vllm_config.compilation_config, + "encoder_compilation_time", + 0.0, + ), ) def _warm_up_atb(self):
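
A note on the `_CPU_GPU_BUFFER_TARGET` switch in test_eagle_proposer.py: `unittest.mock.patch` replaces a name in the module where it is looked up, not in the module where it is defined, so the patch target has to follow the vLLM #40732 import move or the proposer under test would keep using the real buffer class. A minimal, self-contained sketch of that lookup-site behavior; `defs_mod` and `user_mod` are throwaway stand-ins, not real vLLM modules:

    import sys
    import types
    from unittest import mock

    class CpuGpuBuffer:  # stand-in for the real buffer class
        pass

    # `defs_mod` plays the defining module; `user_mod` re-exports the class,
    # the way vllm.v1.spec_decode.eagle did before vLLM #40732.
    defs_mod = types.ModuleType("defs_mod")
    defs_mod.CpuGpuBuffer = CpuGpuBuffer
    user_mod = types.ModuleType("user_mod")
    user_mod.CpuGpuBuffer = CpuGpuBuffer
    sys.modules["defs_mod"] = defs_mod
    sys.modules["user_mod"] = user_mod

    with mock.patch("user_mod.CpuGpuBuffer") as fake:
        assert user_mod.CpuGpuBuffer is fake          # lookup site is mocked
        assert defs_mod.CpuGpuBuffer is CpuGpuBuffer  # defining site is not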
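
The `maybe_make_prepare_finalize` overrides (both the 910 and 310P methods, plus the new unit test in test_fused_moe.py) follow an opt-out pattern: the base layer calls a factory hook during initialization, and a backend that owns its own dispatch path returns None from it. A sketch under the assumption that a None result makes the caller skip modular-kernel setup; `BaseMethod` and `Layer` are illustrative stand-ins, not vLLM's actual FusedMoE internals:

    class BaseMethod:
        def maybe_make_prepare_finalize(self, routing_tables=None):
            raise NotImplementedError("modular kernels required upstream")

    class AscendMethod(BaseMethod):
        def maybe_make_prepare_finalize(self, routing_tables=None):
            return None  # keep the Ascend forward_impl path intact

    class Layer:
        def __init__(self, method: BaseMethod):
            # Mirrors the assumed contract: None means "no modular kernel",
            # so the layer keeps its custom forward path.
            self.prepare_finalize = method.maybe_make_prepare_finalize()

    assert Layer(AscendMethod()).prepare_finalize is None

Keeping `base_quant_method` pointed at the swapped-in Ascend method, as the fused_moe.py hunks above do, matters for the same reason: otherwise the base class would consult the upstream method's hook instead of the override.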
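
On the lora/utils.py migration: with a set, re-running `refresh_all_lora_classes` was naturally idempotent, while the ordered tuple introduced by vLLM #35077 would grow duplicates if the refresh ever ran twice. A small sketch of the difference; the membership guard is a hypothetical hardening, not part of the change above:

    upstream = ("ColumnParallelLinearWithLoRA", "RowParallelLinearWithLoRA")
    ascend = ("AscendColumnParallelLinearWithLoRA", "AscendRowParallelLinearWithLoRA")

    registry = upstream
    for _ in range(2):  # simulate the refresh running twice
        if not set(ascend) <= set(registry):  # hypothetical re-entry guard
            registry = (*registry, *ascend)

    assert registry == (*upstream, *ascend)  # appended once, order preserved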
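
For the `_ascend_resolve_kv_cache_block_sizes` override, a worked example of the scheduler block size returned in the hybrid-groups-plus-CP branch. The group sizes are hypothetical; the point is that the LCM times the CP factors yields a block size every group divides evenly:

    import math

    # Hypothetical hybrid model: a full-attention group with block_size=128
    # and an SWA group with block_size=64, running with dcp=2, pcp=1.
    group_block_sizes = [128, 64]
    dcp, pcp = 2, 1

    scheduler_block_size = math.lcm(*group_block_sizes) * dcp * pcp
    assert scheduler_block_size == 256  # lcm(128, 64) = 128; 128 * 2 * 1 = 256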
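
Finally, the worker.py change leans on `getattr` with a default so the same call builds `CompilationTimes` against both the pinned v0.19.1 (which lacks `encoder_compilation_time`) and newer vLLM. The pattern in isolation, with stand-in config classes:

    class OldCompilationConfig:  # v0.19.1-shaped stand-in
        compilation_time = 1.5

    class NewCompilationConfig(OldCompilationConfig):  # post-#39240 stand-in
        encoder_compilation_time = 0.25

    assert getattr(OldCompilationConfig(), "encoder_compilation_time", 0.0) == 0.0
    assert getattr(NewCompilationConfig(), "encoder_compilation_time", 0.0) == 0.25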