From 4aeabf2aa4272fbf5f58ab82f4e87e174999ffc5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Jan 2026 18:44:16 -0500 Subject: [PATCH 001/191] initial MoERunner refactor Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 2 + .../fused_moe/runner/default_moe_runner.py | 701 ++++++++++-------- 2 files changed, 406 insertions(+), 297 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 75283b9bbe39..2f704569209c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -504,6 +504,8 @@ def __init__( self.apply_router_weight_on_input = apply_router_weight_on_input self.activation = MoEActivation.from_str(activation) + # TODO(bnell): we should not have to create a router if the kernel is + # monolithic. self.router = create_fused_moe_router( top_k=top_k, global_num_experts=self.global_num_experts, diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 12b560493fa2..bb1ac465b1c5 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable from contextlib import nullcontext from typing import TYPE_CHECKING @@ -82,9 +83,23 @@ def _moe_forward( layer = get_layer_from_name(_resolve_layer_name(layer_name)) # TODO(bnell): this can be removed after MK migration is complete. 
layer.ensure_moe_quant_config_init() - return layer.runner.forward_impl( - layer, hidden_states, router_logits, shared_experts_input - ) + runner = layer.runner + router_logits = runner._maybe_gate(hidden_states, router_logits) + with runner._sequence_parallel_context(): + if runner.use_dp_chunking: + return runner.forward_impl_chunked( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) + else: + return runner.forward_impl( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) def _moe_forward_fake( @@ -105,9 +120,23 @@ def _moe_forward_shared( layer = get_layer_from_name(_resolve_layer_name(layer_name)) # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() - return layer.runner.forward_impl( - layer, hidden_states, router_logits, shared_experts_input - ) + runner = layer.runner + router_logits = runner._maybe_gate(hidden_states, router_logits) + with runner._sequence_parallel_context(): + if runner.use_dp_chunking: + return runner.forward_impl_chunked( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) + else: + return runner.forward_impl( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) def _moe_forward_shared_fake( @@ -191,10 +220,17 @@ def __init__( self.reduce_results = reduce_results self.enable_dbo = enable_dbo + # Chunked all2all staging tensor + # TODO(bnell) rename these? + self.batched_hidden_states: torch.Tensor | None = None + self.batched_router_logits: torch.Tensor | None = None + self._maybe_init_dp_chunking() + # Allow disabling of the separate shared experts stream for # debug purposes. 
# TODO: Remove this after more extensive testings with TP/DP # and other execution modes + self.use_shared_experts_stream = False if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") self.shared_experts_stream = None @@ -210,24 +246,22 @@ def __init__( # Needed for string -> FusedMoE layer lookup in custom ops. self.layer_name = layer.layer_name + self.moe_forward = self._select_forward(layer) + + def _select_forward(self, layer: torch.nn.Module) -> Callable: if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we # will switch to using the moe_forward custom op. # Note: CPU doesn't require wrapped forward_impl. - if self.shared_experts is None: - self.moe_forward = _moe_forward - else: - self.moe_forward = _moe_forward_shared - else: - if self.shared_experts is None: - self.moe_forward = torch.ops.vllm.moe_forward - else: - self.moe_forward = torch.ops.vllm.moe_forward_shared + return _moe_forward if self.shared_experts is None else _moe_forward_shared - # Chunked all2all staging tensor - self.batched_hidden_states: torch.Tensor | None = None - self.batched_router_logits: torch.Tensor | None = None + return ( + torch.ops.vllm.moe_forward + if self.shared_experts is None + else torch.ops.vllm.moe_forward_shared + ) + # TODO(bnell): make this a member var? 
@property def use_dp_chunking(self) -> bool: return ( @@ -241,22 +275,9 @@ def _maybe_setup_shared_experts_stream( self, hidden_states: torch.Tensor, shared_input: torch.Tensor | None, - has_separate_shared_experts: bool, - use_chunked_impl: bool, - ) -> tuple[bool, torch.Tensor | None]: - use_shared_experts_stream = ( - current_platform.is_cuda() - and has_separate_shared_experts - and not use_chunked_impl - and self.shared_experts_stream is not None - and ( - hidden_states.shape[0] - <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD - ) - ) - - shared_experts_input: torch.Tensor | None = None - if use_shared_experts_stream: + ) -> torch.Tensor | None: + hidden_states_clone: torch.Tensor | None = None + if self.use_shared_experts_stream: assert self.shared_experts_stream is not None assert self.moe_config.disable_inplace @@ -278,12 +299,13 @@ def _maybe_setup_shared_experts_stream( assert self.shared_experts_stream is not None self.shared_experts_stream.wait_stream(current_stream()) - return use_shared_experts_stream, shared_experts_input + return shared_experts_input - def ensure_dp_chunking_init(self): - if not self.use_dp_chunking or self.batched_hidden_states is not None: + def _maybe_init_dp_chunking(self): + if not self.use_dp_chunking: return + assert self.batched_hidden_states is None states_shape: tuple[int, ...] logits_shape: tuple[int, ...] 
@@ -309,6 +331,38 @@ def ensure_dp_chunking_init(self): device=device, ) + @property + def has_separate_shared_experts(self) -> bool: + return ( + not self.quant_method.mk_owns_shared_expert + and self.shared_experts is not None + ) + + def _apply_shared_experts( + self, + hidden_states: torch.Tensor, + allow_streaming: bool = False, + ) -> torch.Tensor | None: + shared_output: torch.Tensor | None = None + if self.has_separate_shared_experts: + assert self.shared_experts is not None + + if self.use_shared_experts_stream and allow_streaming: + # Run shared experts in parallel on a separate stream + # NOTE: We start the separate stream here and mark the + # sync end point immediately after it is done. This is + # important to avoid excessive stream allocations by the cuda + # graph replay later. + with torch.cuda.stream(self.shared_experts_stream): + # Note that hidden_states clone() is necessary here to avoid + # conflict with the main stream + shared_output = self.shared_experts(hidden_states) + current_stream().wait_stream(self.shared_experts_stream) + else: + shared_output = self.shared_experts(hidden_states) + + return shared_output + def must_reduce_shared_expert_outputs(self) -> bool: """ The shared_experts are typically computed using the RowParallelLinear @@ -322,7 +376,6 @@ def must_reduce_shared_expert_outputs(self) -> bool: Therefore it is required that we reduce the shared_experts output early. 
""" - assert self.quant_method is not None return ( self.quant_method.moe_kernel is not None and self.quant_method.moe_kernel.output_is_reduced() @@ -357,7 +410,7 @@ def apply_routed_input_transform(self, hidden_states: torch.Tensor) -> torch.Ten return result return hidden_states - def _reduce_output( + def _maybe_reduce_output( self, states: torch.Tensor | tuple[torch.Tensor, torch.Tensor], trunc_sizes: list[int], @@ -397,23 +450,12 @@ def _encode_layer_name(self) -> str | ModuleName: return "from_forward_context" return self.layer_name - def forward( + def _maybe_pad_hidden_states( self, + original_hidden_states: torch.Tensor | None, hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # For latent MoE: save ORIGINAL hidden_states before transform - # (shared_experts need original dimension, routed experts use transformed) - if self.shared_experts is not None: - original_hidden_states = hidden_states - original_hidden_dim = hidden_states.shape[-1] - else: - original_hidden_states = None - - # Apply transform for routed experts (e.g., latent projection for latent MoE) - hidden_states = self.apply_routed_input_transform(hidden_states) - - # This is the dimension after transform (for routed expert output slicing) + ) -> tuple[torch.Tensor, list[int]]: + original_hidden_dim = original_hidden_states.shape[-1] if original_hidden_states is not None else 0 transformed_hidden_dim = hidden_states.shape[-1] if ( not self.quant_method.skip_forward_padding @@ -426,134 +468,269 @@ def forward( value=0.0, ) - fused_output = self.moe_forward( - hidden_states, - router_logits, - original_hidden_states, - self._encode_layer_name(), - ) - if self.shared_experts is not None: orig_hidden_dims = [original_hidden_dim, transformed_hidden_dim] else: orig_hidden_dims = [transformed_hidden_dim] - return self._reduce_output(fused_output, orig_hidden_dims) + return hidden_states, orig_hidden_dims - def forward_impl_chunked( + 
def _apply_quant_method( self, layer: torch.nn.Module, - full_hidden_states: torch.Tensor, - full_router_logits: torch.Tensor, - full_shared_input: torch.Tensor | None, - has_separate_shared_experts: bool, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + hidden_states: torch.Tensor, + extra_tensor: torch.Tensor | None, + router_logits: torch.Tensor, + shared_input: torch.Tensor | None, + run_shared_experts_before: bool = True, + ) -> tuple[torch.Tensor | None, torch.Tensor]: + shared_output: torch.Tensor | None = None + + # Run this before quant_method to avoid inplace issues. + if run_shared_experts_before: + shared_input = shared_input if shared_input is not None else hidden_states + shared_output = self._apply_shared_experts( + shared_input, + False, + ) + else: + hidden_states_clone = self._maybe_setup_shared_experts_stream( + hidden_states, + shared_input, + ) + + # TODO(bnell): deal with fp4 flashinfer tuple hidden states hack (#30014). + # Figure out nicer way to do this. 
+ x_arg = hidden_states if extra_tensor is None else (hidden_states, extra_tensor) + + if self.quant_method.is_monolithic: + result = self.quant_method.apply_monolithic( + layer=layer, + x=x_arg, + router_logits=router_logits, + ) + else: + topk_weights, topk_ids = self.router.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + ) + + result = self.quant_method.apply( + layer=layer, + x=x_arg, + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts_input=shared_input, + ) + + if isinstance(result, tuple): + assert shared_output is None + shared_output, hidden_states = result + else: + hidden_states = result + + if not run_shared_experts_before and self.has_separate_shared_experts: + assert shared_output is None + shared_output = self._apply_shared_experts( + hidden_states_clone, + True, + ) + + return shared_output, hidden_states + + def _sequence_parallel_context(self): + ctx = get_forward_context() + return ( + ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size) + if ctx.dp_metadata + else nullcontext() + ) + + def _allocate_dp_chunking_outputs( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor | None, torch.Tensor]: + assert self.use_dp_chunking + + # Assert the inputs are of the proper type and shape. assert self.batched_hidden_states is not None assert self.batched_router_logits is not None - assert self.batched_hidden_states.dtype == full_hidden_states.dtype, ( - f"{self.batched_hidden_states.dtype} == {full_hidden_states.dtype}" + + assert self.batched_hidden_states.dtype == hidden_states.dtype, ( + f"{self.batched_hidden_states.dtype} == {hidden_states.dtype}" ) - assert self.batched_router_logits.dtype == full_router_logits.dtype, ( - f"{self.batched_router_logits.dtype} == {full_router_logits.dtype}" + assert self.batched_router_logits.dtype == router_logits.dtype, ( + f"{self.batched_router_logits.dtype} == {router_logits.dtype}" ) - # Check size compatibility. 
- assert self.batched_hidden_states.size(-1) == full_hidden_states.size(-1) - assert self.batched_router_logits.size(-1) == full_router_logits.size(-1) - # TODO(bnell): Fix shared_expert_inputs w/chunking. - # assert shared_input is None, ( - # "Routed input transform is not currently supported with DP chunking." - # ) + # Check size compatibility. + assert self.batched_hidden_states.size(-1) == hidden_states.size(-1) + assert self.batched_router_logits.size(-1) == router_logits.size(-1) - full_fused_final_hidden_states = torch.empty_like(full_hidden_states) + final_fused_hidden_states = torch.empty_like(hidden_states) if self.shared_experts is not None: - full_shared_final_hidden_states = torch.empty_like(full_hidden_states) - - def process_chunk(chunk_start, chunk_end, skip_result_store=False): - chunk_size = chunk_end - chunk_start - hidden_states = full_hidden_states[chunk_start:chunk_end, :] - router_logits = full_router_logits[chunk_start:chunk_end, :] - shared_input = ( - full_shared_input[chunk_start:chunk_end, :] - if full_shared_input is not None - else None + final_shared_hidden_states = torch.empty_like(hidden_states) + else: + final_shared_hidden_states = None + + return final_shared_hidden_states, final_fused_hidden_states + + def _maybe_gate( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor: + # If router/gate provided, then apply it here. 
+ # (Note: This code runs only when "overlapped mode" is on to allow + # parallel execution of shared experts with the FusedMoE via + # separate cuda stream) + if self.gate is not None: + router_logits, _ = self.gate(hidden_states) + return router_logits + + @property + def do_naive_dispatch_combine(self) -> bool: + return ( + self.moe_config.dp_size > 1 and not self.quant_method.supports_internal_mk + ) + + def _maybe_dispatch( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: + extra_tensor: torch.Tensor | None = None + + if self.do_naive_dispatch_combine: + post_quant_allgather = ( + self.moe_config.dp_size > 1 + and self.moe_config.use_ep + and getattr(self.quant_method, "do_post_quant_allgather", False) ) - assert self.batched_hidden_states is not None - assert self.batched_router_logits is not None - # This is only true when DBO has been enabled in the config. - # Both tensors will have an outer dimension for the ubatch id - if self.batched_hidden_states.dim() == 3: - assert self.batched_router_logits.dim() == 3 - batch_buffer_idx = dbo_current_ubatch_id() - batched_hidden_states = self.batched_hidden_states[batch_buffer_idx, :] - batched_router_logits = self.batched_router_logits[batch_buffer_idx, :] + extra_tensors: list[torch.Tensor] | None = None + + if post_quant_allgather: + hidden_states_to_dispatch, extra_tensors = ( + self.quant_method.prepare_dp_allgather_tensor( + layer, hidden_states, router_logits + ) + ) else: - batched_hidden_states = self.batched_hidden_states - batched_router_logits = self.batched_router_logits + hidden_states_to_dispatch = hidden_states - assert ( - batched_hidden_states.size(0) # type: ignore - >= chunk_size + result = get_ep_group().dispatch_router_logits( + hidden_states_to_dispatch, + router_logits, + self.moe_config.is_sequence_parallel, + extra_tensors=extra_tensors, ) - assert ( - batched_router_logits.size(0) 
# type: ignore - >= chunk_size + + if len(result) == 3: + hidden_states, router_logits, extra_tensors = result + assert isinstance(extra_tensors, list) and len(extra_tensors) == 1 + extra_tensor = extra_tensors[0] + else: + hidden_states, router_logits = result + + # NOTE: Similar with DP, PCP also needs dispatch and combine. For + # simplicity, AgRsAll2All was added separately for PCP here. Maybe + # we should modify All2AllManager abstraction to better support PCP. + # TODO(bnell): see what we can do here + if self.moe_config.pcp_size > 1: + hidden_states = get_pcp_group().all_gather( + hidden_states, + dim=0, ) - staged_hidden_states = batched_hidden_states[:chunk_size, :] # type: ignore - staged_router_logits = batched_router_logits[:chunk_size, :] # type: ignore - staged_hidden_states.copy_(hidden_states, non_blocking=True) - staged_router_logits.copy_(router_logits, non_blocking=True) + router_logits = get_pcp_group().all_gather( + router_logits, + dim=0, + ) + + return hidden_states, router_logits, extra_tensor - shared_input = ( - shared_input if shared_input is not None else staged_hidden_states + def _maybe_combine( + self, + shared_output: torch.Tensor | None, + hidden_states: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]: + if self.do_naive_dispatch_combine: + hidden_states = get_ep_group().combine( + hidden_states, self.moe_config.is_sequence_parallel ) - # Matrix multiply. - if self.quant_method.is_monolithic: - assert has_separate_shared_experts or self.shared_experts is None - final_hidden_states = self.quant_method.apply_monolithic( - layer=layer, - x=staged_hidden_states, - router_logits=staged_router_logits, - ) - else: - topk_weights, topk_ids = self.router.select_experts( - hidden_states=staged_hidden_states, - router_logits=staged_router_logits, - ) + if self.moe_config.pcp_size > 1: + hidden_states = get_pcp_group().reduce_scatter( + hidden_states, + dim=0, + ) + # need RS for shared_output? 
- final_hidden_states = self.quant_method.apply( - layer=layer, - x=staged_hidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - shared_experts_input=shared_input, - ) + if self.shared_experts is not None: + assert shared_output is not None + return shared_output, hidden_states + else: + return hidden_states - if has_separate_shared_experts: - assert not isinstance(final_hidden_states, tuple) - assert self.shared_experts is not None + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + # For latent MoE: save ORIGINAL hidden_states before transform + # (shared_experts need original dimension, routed experts use transformed) + original_hidden_states = hidden_states - shared_output = self.shared_experts(shared_input) + # Apply transform for routed experts (e.g., latent projection for latent MoE) + hidden_states = self.apply_routed_input_transform(hidden_states) - final_hidden_states = ( - shared_output, - final_hidden_states, - ) + hidden_states, og_hidden_dims = self._maybe_pad_hidden_states( + original_hidden_states, + hidden_states, + ) - if not skip_result_store: - if self.shared_experts is None: - full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states, non_blocking=True - ) - else: - full_shared_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states[0], non_blocking=True - ) - full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states[1], non_blocking=True - ) + fused_output = self.moe_forward( + hidden_states, + router_logits, + original_hidden_states, + self._encode_layer_name(), + ) + + return self._maybe_reduce_output(fused_output, og_hidden_dims) + + # TODO: avoid some of the copying by disabling inplace? 
+ def _slice_and_copy_input( + self, + out_slice: torch.Tensor, + orig: torch.Tensor | None, + start: int, + end: int, + ) -> torch.Tensor: + assert orig is not None + slice_size = end - start + orig_slice = orig[start:end, :] + if self.enable_dbo: + assert out_slice.dim() == 3 + batch_buffer_idx = dbo_current_ubatch_id() + out_slice = out_slice[batch_buffer_idx, :] + + assert out_slice.size(0) >= slice_size + out_slice = out_slice[:slice_size, :] + out_slice.copy_(orig_slice, non_blocking=True) + return out_slice + + def forward_impl_chunked( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_input: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + final_shared_hidden_states, final_fused_hidden_states = ( + self._allocate_dp_chunking_outputs(hidden_states, router_logits) + ) ctx = get_forward_context() # flashinfer_cutlass_kernels can handle: optional DP + TP/EP @@ -567,7 +744,7 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False): max_tokens_across_dispatchers, self.moe_config.sp_size ) - num_tokens = full_hidden_states.size(0) + num_tokens = hidden_states.size(0) for chunk_idx, chunk_start_ in enumerate( range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank) ): @@ -578,17 +755,52 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False): # clamp start and end chunk_start = min(chunk_start, num_tokens - 1) chunk_end = min(chunk_end, num_tokens) - with ctx.dp_metadata.chunked_sizes( + chunk_sizes = ctx.dp_metadata.chunked_sizes( self.moe_config.sp_size, moe_dp_chunk_size_per_rank, chunk_idx - ): - process_chunk( - chunk_start, chunk_end, skip_result_store=chunk_start_ >= num_tokens + ) + with chunk_sizes: + hidden_states_chunk = self._slice_and_copy_input( + self.batched_hidden_states, + hidden_states, + chunk_start, + chunk_end, ) + router_logits_chunk = self._slice_and_copy_input( + self.batched_router_logits, + router_logits, + 
chunk_start, + chunk_end, + ) + + shared_input_chunk = shared_input[chunk_start:chunk_end, :] if shared_input is not None else None + + shared_output_chunk, hidden_states_chunk = self._apply_quant_method( + layer=layer, + hidden_states=hidden_states_chunk, + extra_tensor=None, + router_logits=router_logits_chunk, + shared_input=shared_input_chunk, + ) + + # Store outputs + # TODO(bnell): document when chunk_start >= num_tokens + if chunk_start < num_tokens: + final_fused_hidden_states[chunk_start:chunk_end, :].copy_( + hidden_states_chunk, non_blocking=True + ) + if self.shared_experts is not None: + assert shared_output_chunk is not None + assert final_shared_hidden_states is not None + final_shared_hidden_states[chunk_start:chunk_end, :].copy_( + shared_output_chunk, non_blocking=True + ) + if self.shared_experts is None: - return full_fused_final_hidden_states + return final_fused_hidden_states else: - return (full_shared_final_hidden_states, full_fused_final_hidden_states) + assert final_shared_hidden_states is not None + return (final_shared_hidden_states, final_fused_hidden_states) def forward_impl( self, @@ -597,148 +809,43 @@ def forward_impl( router_logits: torch.Tensor, shared_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.quant_method is not None - - self.ensure_dp_chunking_init() - - has_separate_shared_experts = ( - not self.quant_method.mk_owns_shared_expert - and self.shared_experts is not None - ) - - use_chunked_impl = self.use_dp_chunking - - use_shared_experts_stream, shared_experts_input = ( - self._maybe_setup_shared_experts_stream( - hidden_states, - shared_input, - has_separate_shared_experts, - use_chunked_impl, + # TODO(bnell): split this into runtime vs. static parts? 
+ self.use_shared_experts_stream = ( + current_platform.is_cuda() + and self.has_separate_shared_experts + and not self.use_dp_chunking + and self.shared_experts_stream is not None + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD ) ) - # If router/gate provided, then apply it here. - # (Note: This code runs only when "overlapped mode" is on to allow - # parallel execution of shared experts with the FusedMoE via - # separate cuda stream) - if self.gate is not None: - router_logits, _ = self.gate(hidden_states) - - if use_chunked_impl: - return self.forward_impl_chunked( - layer, - hidden_states, - router_logits, - shared_input, - has_separate_shared_experts, - ) - - # NOTE(rob): once we finish migrating all the quant methods to use - # MKs, we can remove the naive dispatch/combine path from here. - do_naive_dispatch_combine = ( - self.moe_config.dp_size > 1 and not self.quant_method.supports_internal_mk + # Check if we need to run shared experts before matrix multiply because + # matrix multiply may modify the hidden_states. + run_shared_experts_before = ( + self.has_separate_shared_experts and not self.use_shared_experts_stream ) - ctx = get_forward_context() - sp_ctx = ( - ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size) - if ctx.dp_metadata - else nullcontext() + # TODO(bnell): parts of the dispatch/combine steps will go away once + # #32567 lands and the remaining kernels are made MKs. The PCP + # code will probably remain + hidden_states, router_logits, extra_tensor = self._maybe_dispatch( + layer, + hidden_states, + router_logits, ) - with sp_ctx: - # Run shared experts before matrix multiply. - # because matrix multiply maybe modify the hidden_states. 
- if has_separate_shared_experts and not use_shared_experts_stream: - assert self.shared_experts is not None - shared_input = ( - shared_input if shared_input is not None else hidden_states - ) - shared_output = self.shared_experts(shared_input) - - # For naive dispatch/combine Dp/Ep, dispatch the hidden states and - # router logits to all experts. - # NOTE: this will be removed once all kernels are migrated into the - # MoEKernel framework. - if do_naive_dispatch_combine: - hidden_states, router_logits = get_ep_group().dispatch_router_logits( - hidden_states, - router_logits, - self.moe_config.is_sequence_parallel, - ) - - # NOTE: Similar with DP, PCP also needs dispatch and combine. For - # simplicity, AgRsAll2All was added separately for PCP here. Maybe - # we should modify All2AllManager abstract to better support PCP. - if self.moe_config.pcp_size > 1: - hidden_states = get_pcp_group().all_gather( - hidden_states, - dim=0, - ) - router_logits = get_pcp_group().all_gather( - router_logits, - dim=0, - ) - - # Matrix multiply. - if self.quant_method.is_monolithic: - final_hidden_states = self.quant_method.apply_monolithic( - layer=layer, - x=hidden_states, - router_logits=router_logits, - ) - else: - topk_weights, topk_ids = self.router.select_experts( - hidden_states=hidden_states, - router_logits=router_logits, - ) - - final_hidden_states = self.quant_method.apply( - layer=layer, - x=hidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - shared_experts_input=shared_input, - ) - - if has_separate_shared_experts: - assert self.shared_experts is not None - - if use_shared_experts_stream: - # Run shared experts in parallel on a separate stream - # NOTE: We start the separate stream here and mark the - # sync end point immediately after it is done. This is - # important to avoid excessive stream allocations by the cuda - # graph replay later. 
- with torch.cuda.stream(self.shared_experts_stream): - # Note that hidden_states clone() is necessary here to avoid - # conflict with the main stream - shared_output = self.shared_experts(shared_experts_input) - current_stream().wait_stream(self.shared_experts_stream) - - final_hidden_states = ( - shared_output, - final_hidden_states, - ) - - def combine_output(states: torch.Tensor) -> torch.Tensor: - if do_naive_dispatch_combine: - states = get_ep_group().combine( - states, self.moe_config.is_sequence_parallel - ) - - if self.moe_config.pcp_size > 1: - states = get_pcp_group().reduce_scatter( - states, - dim=0, - ) - - return states + shared_output, hidden_states = self._apply_quant_method( + layer=layer, + hidden_states=hidden_states, + extra_tensor=extra_tensor, + router_logits=router_logits, + shared_input=shared_input, + run_shared_experts_before=run_shared_experts_before, + ) - if self.shared_experts is not None: - return ( - final_hidden_states[0], - combine_output(final_hidden_states[1]), - ) - else: - return combine_output(final_hidden_states) + return self._maybe_combine( + shared_output, + hidden_states, + ) From a4d3acb8469a4355aae6eb89a5146f46caf7c4ef Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 12 Feb 2026 18:03:08 -0500 Subject: [PATCH 002/191] fix lint Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/default_moe_runner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index bb1ac465b1c5..c7de96e018e4 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -773,7 +773,11 @@ def forward_impl_chunked( chunk_end, ) - shared_input_chunk = shared_input[chunk_start:chunk_end, :] if shared_input is not None else None + shared_input_chunk = ( + shared_input[chunk_start:chunk_end, :] + if 
shared_input is not None + else None + ) shared_output_chunk, hidden_states_chunk = self._apply_quant_method( layer=layer, From 5b7f133cab46c2a8e7bad5424f8e54a4671c5c4a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Feb 2026 13:24:40 -0500 Subject: [PATCH 003/191] rebase Signed-off-by: Bill Nell --- .../fused_moe/runner/default_moe_runner.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index c7de96e018e4..5a204bd67ec8 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -261,7 +261,6 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: else torch.ops.vllm.moe_forward_shared ) - # TODO(bnell): make this a member var? @property def use_dp_chunking(self) -> bool: return ( @@ -276,7 +275,7 @@ def _maybe_setup_shared_experts_stream( hidden_states: torch.Tensor, shared_input: torch.Tensor | None, ) -> torch.Tensor | None: - hidden_states_clone: torch.Tensor | None = None + shared_experts_input: torch.Tensor | None = None if self.use_shared_experts_stream: assert self.shared_experts_stream is not None assert self.moe_config.disable_inplace @@ -455,7 +454,11 @@ def _maybe_pad_hidden_states( original_hidden_states: torch.Tensor | None, hidden_states: torch.Tensor, ) -> tuple[torch.Tensor, list[int]]: - original_hidden_dim = original_hidden_states.shape[-1] if original_hidden_states is not None else 0 + original_hidden_dim = ( + original_hidden_states.shape[-1] + if original_hidden_states is not None + else 0 + ) transformed_hidden_dim = hidden_states.shape[-1] if ( not self.quant_method.skip_forward_padding @@ -681,7 +684,10 @@ def forward( ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # For latent MoE: save ORIGINAL hidden_states before transform # (shared_experts need 
original dimension, routed experts use transformed) - original_hidden_states = hidden_states + if self.shared_experts is not None: + original_hidden_states = hidden_states + else: + original_hidden_states = None # Apply transform for routed experts (e.g., latent projection for latent MoE) hidden_states = self.apply_routed_input_transform(hidden_states) @@ -700,7 +706,6 @@ def forward( return self._maybe_reduce_output(fused_output, og_hidden_dims) - # TODO: avoid some of the copying by disabling inplace? def _slice_and_copy_input( self, out_slice: torch.Tensor, @@ -813,7 +818,6 @@ def forward_impl( router_logits: torch.Tensor, shared_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # TODO(bnell): split this into runtime vs. static parts? self.use_shared_experts_stream = ( current_platform.is_cuda() and self.has_separate_shared_experts From fad7f33991fc9209659cd3789a0ae22554efb12c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 5 Mar 2026 16:03:53 -0500 Subject: [PATCH 004/191] rebase + remove dead code Signed-off-by: Bill Nell --- .../fused_moe/runner/default_moe_runner.py | 53 ++++--------------- 1 file changed, 11 insertions(+), 42 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 5a204bd67ec8..15ccd47667a5 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -482,7 +482,6 @@ def _apply_quant_method( self, layer: torch.nn.Module, hidden_states: torch.Tensor, - extra_tensor: torch.Tensor | None, router_logits: torch.Tensor, shared_input: torch.Tensor | None, run_shared_experts_before: bool = True, @@ -502,14 +501,10 @@ def _apply_quant_method( shared_input, ) - # TODO(bnell): deal with fp4 flashinfer tuple hidden states hack (#30014). - # Figure out nicer way to do this. 
- x_arg = hidden_states if extra_tensor is None else (hidden_states, extra_tensor) - if self.quant_method.is_monolithic: result = self.quant_method.apply_monolithic( layer=layer, - x=x_arg, + x=hidden_states, router_logits=router_logits, ) else: @@ -520,7 +515,7 @@ def _apply_quant_method( result = self.quant_method.apply( layer=layer, - x=x_arg, + x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, shared_experts_input=shared_input, @@ -603,45 +598,21 @@ def _maybe_dispatch( layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: - extra_tensor: torch.Tensor | None = None - + ) -> tuple[torch.Tensor, torch.Tensor]: + # For naive dispatch/combine Dp/Ep, dispatch the hidden states and + # router logits to all experts. + # NOTE: this will be removed once all kernels are migrated into the + # MoEKernel framework. if self.do_naive_dispatch_combine: - post_quant_allgather = ( - self.moe_config.dp_size > 1 - and self.moe_config.use_ep - and getattr(self.quant_method, "do_post_quant_allgather", False) - ) - - extra_tensors: list[torch.Tensor] | None = None - - if post_quant_allgather: - hidden_states_to_dispatch, extra_tensors = ( - self.quant_method.prepare_dp_allgather_tensor( - layer, hidden_states, router_logits - ) - ) - else: - hidden_states_to_dispatch = hidden_states - - result = get_ep_group().dispatch_router_logits( - hidden_states_to_dispatch, + hidden_states, router_logits = get_ep_group().dispatch_router_logits( + hidden_states, router_logits, self.moe_config.is_sequence_parallel, - extra_tensors=extra_tensors, ) - if len(result) == 3: - hidden_states, router_logits, extra_tensors = result - assert isinstance(extra_tensors, list) and len(extra_tensors) == 1 - extra_tensor = extra_tensors[0] - else: - hidden_states, router_logits = result - # NOTE: Similar with DP, PCP also needs dispatch and combine. 
For # simplicity, AgRsAll2All was added separately for PCP here. Maybe # we should modify All2AllManager abstraction to better support PCP. - # TODO(bnell): see what we can do here if self.moe_config.pcp_size > 1: hidden_states = get_pcp_group().all_gather( hidden_states, @@ -652,7 +623,7 @@ def _maybe_dispatch( dim=0, ) - return hidden_states, router_logits, extra_tensor + return hidden_states, router_logits def _maybe_combine( self, @@ -787,7 +758,6 @@ def forward_impl_chunked( shared_output_chunk, hidden_states_chunk = self._apply_quant_method( layer=layer, hidden_states=hidden_states_chunk, - extra_tensor=None, router_logits=router_logits_chunk, shared_input=shared_input_chunk, ) @@ -838,7 +808,7 @@ def forward_impl( # TODO(bnell): parts of the dispatch/combine steps will go away once # #32567 lands and the remaining kernels are made MKs. The PCP # code will probably remain - hidden_states, router_logits, extra_tensor = self._maybe_dispatch( + hidden_states, router_logits = self._maybe_dispatch( layer, hidden_states, router_logits, @@ -847,7 +817,6 @@ def forward_impl( shared_output, hidden_states = self._apply_quant_method( layer=layer, hidden_states=hidden_states, - extra_tensor=extra_tensor, router_logits=router_logits, shared_input=shared_input, run_shared_experts_before=run_shared_experts_before, From ec88db3d2aa23924d698e53b3f904a0834a7b809 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 19 Mar 2026 16:44:23 +0000 Subject: [PATCH 005/191] fix gate overlap Signed-off-by: Bill Nell --- .../fused_moe/runner/default_moe_runner.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 15ccd47667a5..a09273fc8049 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -84,7 +84,6 @@ def 
_moe_forward( # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() runner = layer.runner - router_logits = runner._maybe_gate(hidden_states, router_logits) with runner._sequence_parallel_context(): if runner.use_dp_chunking: return runner.forward_impl_chunked( @@ -121,7 +120,6 @@ def _moe_forward_shared( # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() runner = layer.runner - router_logits = runner._maybe_gate(hidden_states, router_logits) with runner._sequence_parallel_context(): if runner.use_dp_chunking: return runner.forward_impl_chunked( @@ -274,8 +272,7 @@ def _maybe_setup_shared_experts_stream( self, hidden_states: torch.Tensor, shared_input: torch.Tensor | None, - ) -> torch.Tensor | None: - shared_experts_input: torch.Tensor | None = None + ): if self.use_shared_experts_stream: assert self.shared_experts_stream is not None assert self.moe_config.disable_inplace @@ -298,8 +295,6 @@ def _maybe_setup_shared_experts_stream( assert self.shared_experts_stream is not None self.shared_experts_stream.wait_stream(current_stream()) - return shared_experts_input - def _maybe_init_dp_chunking(self): if not self.use_dp_chunking: return @@ -486,20 +481,12 @@ def _apply_quant_method( shared_input: torch.Tensor | None, run_shared_experts_before: bool = True, ) -> tuple[torch.Tensor | None, torch.Tensor]: + shared_input = shared_input if shared_input is not None else hidden_states shared_output: torch.Tensor | None = None # Run this before quant_method to avoid inplace issues. 
if run_shared_experts_before: - shared_input = shared_input if shared_input is not None else hidden_states - shared_output = self._apply_shared_experts( - shared_input, - False, - ) - else: - hidden_states_clone = self._maybe_setup_shared_experts_stream( - hidden_states, - shared_input, - ) + shared_output = self._apply_shared_experts(shared_input, False) if self.quant_method.is_monolithic: result = self.quant_method.apply_monolithic( @@ -529,10 +516,7 @@ def _apply_quant_method( if not run_shared_experts_before and self.has_separate_shared_experts: assert shared_output is None - shared_output = self._apply_shared_experts( - hidden_states_clone, - True, - ) + shared_output = self._apply_shared_experts(shared_input, True) return shared_output, hidden_states @@ -704,6 +688,10 @@ def forward_impl_chunked( router_logits: torch.Tensor, shared_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + # Gate overlap not supported when chunking is enabled. Run the + # gate first. + router_logits = self._maybe_gate(hidden_states, router_logits) + final_shared_hidden_states, final_fused_hidden_states = ( self._allocate_dp_chunking_outputs(hidden_states, router_logits) ) @@ -805,6 +793,16 @@ def forward_impl( self.has_separate_shared_experts and not self.use_shared_experts_stream ) + # The shared experts stream must be set up before calling the gate so they + # can be overlapped. + if not run_shared_experts_before: + self._maybe_setup_shared_experts_stream( + hidden_states, + shared_input, + ) + + router_logits = self._maybe_gate(hidden_states, router_logits) + # TODO(bnell): parts of the dispatch/combine steps will go away once # #32567 lands and the remaining kernels are made MKs. 
The PCP # code will probably remain From 76aff0a040ab96f92d144c8aea1006c4a2ec5656 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 4 Feb 2026 10:36:16 -0500 Subject: [PATCH 006/191] wip Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 13 ++-- .../fused_moe/runner/default_moe_runner.py | 64 ++++++++++++++++++- .../layers/fused_moe/shared_fused_moe.py | 41 ++---------- 3 files changed, 74 insertions(+), 44 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2f704569209c..bf6816fc611e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -329,6 +329,7 @@ def __init__( is_sequence_parallel=False, expert_mapping: list[tuple[str, str, int, str]] | None = None, n_shared_experts: int | None = None, + shared_experts: torch.nn.Module | None = None, router_logits_dtype: torch.dtype | None = None, gate: torch.nn.Module | None = None, shared_experts: torch.nn.Module | None = None, @@ -661,6 +662,7 @@ def _init_runner(self): quant_method=self.quant_method, reduce_results=self.reduce_results, enable_dbo=self.vllm_config.parallel_config.enable_dbo, + enable_eplb=self.enable_eplb, ) # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py @@ -708,12 +710,13 @@ def maybe_init_modular_kernel(self) -> None: def shared_experts(self) -> torch.nn.Module | None: return self._shared_experts if self.use_overlapped else None - @property - def layer_id(self): - # Delayed import to avoid circular dependency - from vllm.model_executor.models.utils import extract_layer_index + # TODO(bnell): is this needed? 
+ #@property + #def layer_id(self): + # # Delayed import to avoid circular dependency + # from vllm.model_executor.models.utils import extract_layer_index - return extract_layer_index(self.layer_name) + # return extract_layer_index(self.layer_name) @property def gate(self) -> torch.nn.Module | None: diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index a09273fc8049..9703729d93cf 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -11,6 +11,7 @@ from vllm.distributed import ( get_ep_group, get_pcp_group, + get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) from vllm.forward_context import ( @@ -137,6 +138,42 @@ def _moe_forward_shared( ) +def _moe_forward_shared_new( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + def reduce_shared_out(shared_out: torch.Tensor) -> torch.Tensor: + # Reduce shared expert outputs if necessary, since the MLP + # should have been created with reduce_results=False. 
+ if ( + self.reduce_results + and get_tensor_model_parallel_world_size() > 1 + and self.must_reduce_shared_expert_outputs() + ): + shared_out = tensor_model_parallel_all_reduce(shared_out) + return shared_out + + if False and not self.overlap_shared_experts: + shared_out = self._shared_experts(hidden_states) + # XXXXXXXXXXXx + shared_experts = None + + router_logits = self._maybe_gate(hidden_states, router_logits) + with self._sequence_parallel_context(): + if self.use_dp_chunking: + shared_out, fused_out = self.forward_impl_chunked( + layer, hidden_states, router_logits + ) + else: + shared_out = fused_out = self.forward_impl( + layer, hidden_states, router_logits + ) + + return reduce_shared_out(shared_out), fused_out + + def _moe_forward_shared_fake( hidden_states: torch.Tensor, router_logits: torch.Tensor, @@ -207,6 +244,7 @@ def __init__( quant_method: FusedMoEMethodBase, reduce_results: bool, enable_dbo: bool, + enable_eplb: bool, ): super().__init__() self.moe_config = moe_config @@ -217,9 +255,12 @@ def __init__( self.quant_method = quant_method self.reduce_results = reduce_results self.enable_dbo = enable_dbo + self.enable_eplb = enable_eplb # Chunked all2all staging tensor - # TODO(bnell) rename these? + # TODO rename these + # These need to exist ahead of time due to CUDAgraph construction + # needing a fixed buffer address. 
self.batched_hidden_states: torch.Tensor | None = None self.batched_router_logits: torch.Tensor | None = None self._maybe_init_dp_chunking() @@ -268,6 +309,22 @@ def use_dp_chunking(self) -> bool: or self.moe_config.moe_parallel_config.use_nixl_ep_kernels ) and envs.VLLM_ENABLE_MOE_DP_CHUNK + # TODO(bnell): better name + @property + def overlap_shared_experts( + self, + shared_experts: torch.nn.Module | None, + ) -> bool: + # Disable shared expert overlap if: + # - we are using eplb with non-default backend, because of correctness issues + # - we are using flashinfer with DP, since there nothing to gain + # - we are using marlin kernels + backend = self.moe_config.moe_parallel_config.all2all_backend + return shared_experts is not None and not ( + (self.enable_eplb and backend != "allgather_reducescatter") + or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels + ) + def _maybe_setup_shared_experts_stream( self, hidden_states: torch.Tensor, @@ -476,6 +533,7 @@ def _maybe_pad_hidden_states( def _apply_quant_method( self, layer: torch.nn.Module, + shared_experts: torch.nn.Module | None, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_input: torch.Tensor | None, @@ -514,7 +572,9 @@ def _apply_quant_method( else: hidden_states = result - if not run_shared_experts_before and self.has_separate_shared_experts: + if not run_shared_experts_before and self.has_separate_shared_experts( + shared_experts + ): assert shared_output is None shared_output = self._apply_shared_experts(shared_input, True) diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py index 37336df17561..251c05300331 100644 --- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py @@ -3,10 +3,6 @@ import torch -from vllm.distributed import ( - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, -) from 
vllm.model_executor.layers.fused_moe.layer import FusedMoE @@ -23,36 +19,7 @@ def forward( hidden_states: torch.Tensor, router_logits: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: - if not self.use_overlapped: - if self._shared_experts is not None: - shared_out = self._shared_experts(hidden_states) - - # Reduce shared expert outputs if necessary, since the MLP - # should have been created with reduce_results=False. - if ( - self.reduce_results - and get_tensor_model_parallel_world_size() > 1 - and self.must_reduce_shared_expert_outputs() - ): - shared_out = tensor_model_parallel_all_reduce(shared_out) - else: - shared_out = None - - fused_out = super().forward( - hidden_states=hidden_states, - router_logits=router_logits, - ) - else: - shared_out, fused_out = super().forward( - hidden_states=hidden_states, - router_logits=router_logits, - ) - # ensure early TP reduction of shared expert outputs when required - if ( - shared_out is not None - and self.reduce_results - and get_tensor_model_parallel_world_size() > 1 - and self.must_reduce_shared_expert_outputs() - ): - shared_out = tensor_model_parallel_all_reduce(shared_out) - return shared_out, fused_out + return super().forward( + hidden_states=hidden_states, + router_logits=router_logits, + ) From 4fab915f53e026f3ca53267efd953f4c0a2b826c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 9 Feb 2026 18:27:46 -0500 Subject: [PATCH 007/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index bf6816fc611e..c992930c8859 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -329,7 +329,6 @@ def __init__( is_sequence_parallel=False, expert_mapping: list[tuple[str, str, int, str]] | None = None, n_shared_experts: int | None = None, - shared_experts: 
torch.nn.Module | None = None, router_logits_dtype: torch.dtype | None = None, gate: torch.nn.Module | None = None, shared_experts: torch.nn.Module | None = None, @@ -711,8 +710,8 @@ def shared_experts(self) -> torch.nn.Module | None: return self._shared_experts if self.use_overlapped else None # TODO(bnell): is this needed? - #@property - #def layer_id(self): + # @property + # def layer_id(self): # # Delayed import to avoid circular dependency # from vllm.model_executor.models.utils import extract_layer_index From d8a7f910a0f40b645a858bd22eeca85a11b03bb8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 11 Feb 2026 14:36:30 -0500 Subject: [PATCH 008/191] WIP DOUBLE CHECK THIS Signed-off-by: Bill Nell --- .../fused_moe/router/fused_moe_router.py | 8 +++ .../fused_moe/router/memoizing_router.py | 35 +++++++++++++ .../fused_moe/runner/default_moe_runner.py | 50 +++++++++---------- .../layers/fused_moe/shared_fused_moe.py | 2 +- 4 files changed, 69 insertions(+), 26 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/router/memoizing_router.py diff --git a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py index c322a8cd4cd6..d7aed4fdeb2b 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod +from collections.abc import Callable import torch @@ -13,6 +14,13 @@ class FusedMoERouter(ABC): method that is used for routing hidden states based on router logits. 
""" + @abstractmethod + def set_capture_fn( + self, + capture_fn: Callable[[torch.Tensor], None] | None, + ) -> None: + raise NotImplementedError + @property @abstractmethod def routing_method_type(self) -> RoutingMethodType: diff --git a/vllm/model_executor/layers/fused_moe/router/memoizing_router.py b/vllm/model_executor/layers/fused_moe/router/memoizing_router.py new file mode 100644 index 000000000000..a55bd2f09d6f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/router/memoizing_router.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import torch + +from vllm.model_executor.layers.fused_moe.config import ( + RoutingMethodType, +) +from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter + + +class MemoizingRouter(FusedMoERouter): + def __init__(self, router: FusedMoERouter): + self.router = router + + def set_capture_fn( + self, + capture_fn: Callable[[torch.Tensor], None] | None, + ) -> None: + self.router.set_capture_fn(capture_fn) + self.results: tuple[torch.Tensor, torch.Tensor] | None = None + + @property + def routing_method_type(self) -> RoutingMethodType: + return self.router.routing_method_type + + def select_experts( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + if self.results is None: + self.results = self.router.select_experts(hidden_states, router_logits) + return self.results diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 9703729d93cf..d7f3284ee197 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -144,30 +144,33 @@ def _moe_forward_shared_new( shared_experts_input: torch.Tensor | None, layer_name: str, ) -> 
tuple[torch.Tensor, torch.Tensor]: + layer = get_layer_from_name(layer_name) + runner = layer.runner + def reduce_shared_out(shared_out: torch.Tensor) -> torch.Tensor: # Reduce shared expert outputs if necessary, since the MLP # should have been created with reduce_results=False. if ( - self.reduce_results + runner.reduce_results and get_tensor_model_parallel_world_size() > 1 - and self.must_reduce_shared_expert_outputs() + and runner.must_reduce_shared_expert_outputs() ): shared_out = tensor_model_parallel_all_reduce(shared_out) return shared_out - if False and not self.overlap_shared_experts: - shared_out = self._shared_experts(hidden_states) - # XXXXXXXXXXXx + if not runner.overlap_shared_experts: + shared_out = runner._shared_experts(hidden_states) + # XXXXXXXXXXXXXXXX shared_experts = None - router_logits = self._maybe_gate(hidden_states, router_logits) - with self._sequence_parallel_context(): - if self.use_dp_chunking: - shared_out, fused_out = self.forward_impl_chunked( + router_logits = runner._maybe_gate(hidden_states, router_logits) + with runner._sequence_parallel_context(): + if runner.use_dp_chunking: + shared_out, fused_out = runner.forward_impl_chunked( layer, hidden_states, router_logits ) else: - shared_out = fused_out = self.forward_impl( + shared_out = fused_out = runner.forward_impl( layer, hidden_states, router_logits ) @@ -282,6 +285,13 @@ def __init__( "Enabled separate cuda stream for MoE shared_experts", scope="local" ) + self.allow_shared_experts_stream = ( + current_platform.is_cuda() + and self.has_separate_shared_experts + and not self.use_dp_chunking + and self.shared_experts_stream is not None + ) + # Needed for string -> FusedMoE layer lookup in custom ops. 
self.layer_name = layer.layer_name @@ -313,14 +323,13 @@ def use_dp_chunking(self) -> bool: @property def overlap_shared_experts( self, - shared_experts: torch.nn.Module | None, ) -> bool: # Disable shared expert overlap if: # - we are using eplb with non-default backend, because of correctness issues # - we are using flashinfer with DP, since there nothing to gain # - we are using marlin kernels backend = self.moe_config.moe_parallel_config.all2all_backend - return shared_experts is not None and not ( + return self.shared_experts is not None and not ( (self.enable_eplb and backend != "allgather_reducescatter") or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels ) @@ -533,7 +542,6 @@ def _maybe_pad_hidden_states( def _apply_quant_method( self, layer: torch.nn.Module, - shared_experts: torch.nn.Module | None, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_input: torch.Tensor | None, @@ -572,9 +580,7 @@ def _apply_quant_method( else: hidden_states = result - if not run_shared_experts_before and self.has_separate_shared_experts( - shared_experts - ): + if not run_shared_experts_before and self.has_separate_shared_experts: assert shared_output is None shared_output = self._apply_shared_experts(shared_input, True) @@ -836,15 +842,9 @@ def forward_impl( router_logits: torch.Tensor, shared_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - self.use_shared_experts_stream = ( - current_platform.is_cuda() - and self.has_separate_shared_experts - and not self.use_dp_chunking - and self.shared_experts_stream is not None - and ( - hidden_states.shape[0] - <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD - ) + # TODO(bnell): split this into runtime vs. static parts? 
+ self.use_shared_experts_stream = self.allow_shared_experts_stream and ( + hidden_states.shape[0] <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD ) # Check if we need to run shared experts before matrix multiply because diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py index 251c05300331..9aa9fe8e0a9a 100644 --- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py @@ -6,7 +6,7 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoE -# TODO(bnell): Add shared + fused combo function? e.g. + +# TODO(bnell): Remove this entirely class SharedFusedMoE(FusedMoE): """ A FusedMoE operation that also computes the results of shared experts. From 3dec78f98e40bc7a7d734cf832038ccbe7e8727d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Feb 2026 19:00:11 -0500 Subject: [PATCH 009/191] wip more refactoring Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 7 +- vllm/model_executor/layers/fused_moe/layer.py | 34 +-- .../fused_moe/runner/default_moe_runner.py | 270 ++++++++++++++---- 3 files changed, 231 insertions(+), 80 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 78876ef7c9b0..5aa8a56d4115 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -150,6 +150,7 @@ def _inject_lora_into_fused_moe(self): self.base_layer.quant_method.select_gemm_impl( prepare_finalize, self.base_layer ), + self.base_layer.runner.get_shared_experts(), # XXXXXXXXXXXXXXXXXXXX ) if quant_config.use_mxfp4_w4a16: @@ -592,9 +593,9 @@ def forward(self, *args, **kwargs): def maybe_all_reduce_tensor_model_parallel(self, *args, **kwargs): return self.base_layer.maybe_all_reduce_tensor_model_parallel(*args, **kwargs) - @property - def _shared_experts(self): - return self.base_layer._shared_experts + # @property + # def _shared_experts(self): + # return 
self.base_layer._shared_experts @property def quant_method(self): diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c992930c8859..c6c6dab9dd04 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -636,14 +636,14 @@ def _get_quant_method() -> FusedMoEMethodBase: # - we are using eplb with non-default backend, because of correctness issues # - we are using flashinfer with DP, since there nothing to gain # - we are using marlin kernels - backend = self.moe_parallel_config.all2all_backend - self.use_overlapped = ( - not ( - (self.enable_eplb and backend != "allgather_reducescatter") - or self.moe_parallel_config.use_fi_nvl_two_sided_kernels - ) - and self._shared_experts is not None - ) + # backend = self.moe_parallel_config.all2all_backend + # self.use_overlapped = ( + # not ( + # (self.enable_eplb and backend != "allgather_reducescatter") + # or self.moe_parallel_config.use_fi_all2allv_kernels + # ) + # and self._shared_experts is not None + # ) self.runner = self._init_runner() @@ -656,8 +656,8 @@ def _init_runner(self): moe_config=self.moe_config, router=self.router, routed_input_transform=self._routed_input_transform, - gate=self.gate, - shared_experts=self.shared_experts, + gate=self._gate, + shared_experts=self._shared_experts, quant_method=self.quant_method, reduce_results=self.reduce_results, enable_dbo=self.vllm_config.parallel_config.enable_dbo, @@ -700,14 +700,14 @@ def maybe_init_modular_kernel(self) -> None: self, self.base_quant_method, prepare_finalize, - self.shared_experts, + self.runner.get_shared_experts(), # XXXXXXXXXXXXXXXXX inplace=not self.moe_config.disable_inplace, ) ) - @property - def shared_experts(self) -> torch.nn.Module | None: - return self._shared_experts if self.use_overlapped else None + # @property + # def shared_experts(self) -> torch.nn.Module | None: + # return self._shared_experts if self.use_overlapped 
else None # TODO(bnell): is this needed? # @property @@ -717,9 +717,9 @@ def shared_experts(self) -> torch.nn.Module | None: # return extract_layer_index(self.layer_name) - @property - def gate(self) -> torch.nn.Module | None: - return self._gate if self.use_overlapped else None + # @property + # def gate(self) -> torch.nn.Module | None: + # return self._gate if self.use_overlapped else None @property def tp_size(self): diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index d7f3284ee197..3bfefcb442fe 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -3,6 +3,7 @@ from collections.abc import Callable from contextlib import nullcontext from typing import TYPE_CHECKING +from enum import IntEnum import torch import torch.nn.functional as F @@ -44,6 +45,23 @@ logger = init_logger(__name__) +class SharedExpertsOrder(IntEnum): + # No shared experts. + NONE = (0,) + + # Get rid of this one? combine with BEFORE? + EXTERNAL = (1,) + + # Called by modular kernel. + INTERNAL = (2,) + + # Called right before quant_method is executed. + BEFORE_QUANT_METHOD = (3,) + + # Called right after quant_method is executed (possibly with streaming). + AFTER_QUANT_METHOD = (4,) + + def get_layer_from_name(layer_name: str) -> torch.nn.Module: forward_context: ForwardContext = get_forward_context() if layer_name == "from_forward_context": @@ -81,6 +99,7 @@ def _moe_forward( shared_experts_input: torch.Tensor | None, layer_name: _layer_name_type, ) -> torch.Tensor: +<<<<<<< HEAD layer = get_layer_from_name(_resolve_layer_name(layer_name)) # TODO(bnell): this can be removed after MK migration is complete. 
layer.ensure_moe_quant_config_init() @@ -100,6 +119,15 @@ def _moe_forward( router_logits, shared_experts_input, ) +======= + layer = get_layer_from_name(layer_name) + return layer.runner.forward_dispatch( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) +>>>>>>> d11342756e (wip more refactoring) def _moe_forward_fake( @@ -117,6 +145,7 @@ def _moe_forward_shared( shared_experts_input: torch.Tensor | None, layer_name: _layer_name_type, ) -> tuple[torch.Tensor, torch.Tensor]: +<<<<<<< HEAD layer = get_layer_from_name(_resolve_layer_name(layer_name)) # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() @@ -175,6 +204,15 @@ def reduce_shared_out(shared_out: torch.Tensor) -> torch.Tensor: ) return reduce_shared_out(shared_out), fused_out +======= + layer = get_layer_from_name(layer_name) + return layer.runner.forward_dispatch( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) +>>>>>>> d11342756e (wip more refactoring) def _moe_forward_shared_fake( @@ -264,6 +302,7 @@ def __init__( # TODO rename these # These need to exist ahead of time due to CUDAgraph construction # needing a fixed buffer address. + # TODO: these could be global, i.e. shared by all layers self.batched_hidden_states: torch.Tensor | None = None self.batched_router_logits: torch.Tensor | None = None self._maybe_init_dp_chunking() @@ -285,17 +324,69 @@ def __init__( "Enabled separate cuda stream for MoE shared_experts", scope="local" ) - self.allow_shared_experts_stream = ( + self.use_dp_chunking = ( + self.moe_config.moe_parallel_config.use_pplx_kernels + or self.moe_config.moe_parallel_config.use_deepep_ll_kernels + or self.moe_config.moe_parallel_config.use_mori_kernels + or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels + ) and envs.VLLM_ENABLE_MOE_DP_CHUNK + + # Needed for string -> FusedMoE layer lookup in custom ops. 
+ self.layer_name = layer.layer_name + + self.moe_forward = self._select_forward(layer) + + def _has_external_experts(self) -> bool: + # Disable shared expert overlap if: + # - we are using eplb with non-default backend, because of correctness issues + # - we are using flashinfer with DP, since there nothing to gain + backend = self.moe_config.moe_parallel_config.all2all_backend + return self.shared_experts is not None and not ( + (self.enable_eplb and backend != "allgather_reducescatter") + or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels + ) + + def _determine_shared_experts_order( + self, + hidden_states: torch.Tensor, + ) -> tuple[SharedExpertsOrder, bool]: + if self.shared_experts is None: + return SharedExpertsOrder.NONE, False + + if self._has_external_experts(): + return SharedExpertsOrder.EXTERNAL, False + + has_separate_shared_experts = not self.quant_method.mk_owns_shared_expert + + if not has_separate_shared_experts: + return SharedExpertsOrder.INTERNAL, False + + allow_shared_experts_stream = ( current_platform.is_cuda() - and self.has_separate_shared_experts + and has_separate_shared_experts and not self.use_dp_chunking and self.shared_experts_stream is not None + and hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD ) - # Needed for string -> FusedMoE layer lookup in custom ops. - self.layer_name = layer.layer_name + # Check if we need to run shared experts before matrix multiply because + # matrix multiply may modify the hidden_states. 
+ run_shared_experts_before = ( + has_separate_shared_experts and not allow_shared_experts_stream + ) - self.moe_forward = self._select_forward(layer) + if run_shared_experts_before: + return SharedExpertsOrder.BEFORE_QUANT_METHOD, False + else: + return SharedExpertsOrder.AFTER_QUANT_METHOD, allow_shared_experts_stream + + # XXXXXX used by layer.py and lora/layers/fused_moe.py + def get_shared_experts(self) -> torch.nn.Module | None: + if self._has_external_experts(): + return self.shared_experts + else: + return None def _select_forward(self, layer: torch.nn.Module) -> Callable: if current_platform.is_tpu() or current_platform.is_cpu(): @@ -310,6 +401,7 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: else torch.ops.vllm.moe_forward_shared ) +<<<<<<< HEAD @property def use_dp_chunking(self) -> bool: return ( @@ -391,35 +483,39 @@ def _maybe_init_dp_chunking(self): device=device, ) - @property - def has_separate_shared_experts(self) -> bool: - return ( - not self.quant_method.mk_owns_shared_expert - and self.shared_experts is not None - ) - - def _apply_shared_experts( + def _maybe_apply_shared_experts( self, + shared_output: torch.Tensor | None, hidden_states: torch.Tensor, - allow_streaming: bool = False, + shared_input: torch.Tensor | None, + order: SharedExpertsOrder, ) -> torch.Tensor | None: - shared_output: torch.Tensor | None = None - if self.has_separate_shared_experts: - assert self.shared_experts is not None - - if self.use_shared_experts_stream and allow_streaming: - # Run shared experts in parallel on a separate stream - # NOTE: We start the separate stream here and mark the - # sync end point immediately after it is done. This is - # important to avoid excessive stream allocations by the cuda - # graph replay later. 
- with torch.cuda.stream(self.shared_experts_stream): - # Note that hidden_states clone() is necessary here to avoid - # conflict with the main stream - shared_output = self.shared_experts(hidden_states) - current_stream().wait_stream(self.shared_experts_stream) - else: + experts_order, use_shared_experts_stream = self._determine_shared_experts_order( + hidden_states + ) + if order != experts_order: + return None + + assert self.shared_experts is not None + assert shared_output is None + if order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream: + hidden_states = self._maybe_setup_shared_experts_stream( + hidden_states, + shared_input, + ) + + # Run shared experts in parallel on a separate stream + # NOTE: We start the separate stream here and mark the + # sync end point immediately after it is done. This is + # important to avoid excessive stream allocations by the cuda + # graph replay later. + with torch.cuda.stream(self.shared_experts_stream): + # Note that hidden_states clone() is necessary here to avoid + # conflict with the main stream shared_output = self.shared_experts(hidden_states) + current_stream().wait_stream(self.shared_experts_stream) + else: + shared_output = self.shared_experts(hidden_states) return shared_output @@ -470,6 +566,18 @@ def apply_routed_input_transform(self, hidden_states: torch.Tensor) -> torch.Ten return result return hidden_states + # TODO: combine with runner.forward_impl_X + def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor: + # Reduce shared expert outputs if necessary, since the MLP + # should have been created with reduce_results=False. 
+ if ( + self.reduce_results + and get_tensor_model_parallel_world_size() > 1 + and self.must_reduce_shared_expert_outputs() + ): + shared_out = tensor_model_parallel_all_reduce(shared_out) + return shared_out + def _maybe_reduce_output( self, states: torch.Tensor | tuple[torch.Tensor, torch.Tensor], @@ -518,7 +626,7 @@ def _maybe_pad_hidden_states( original_hidden_dim = ( original_hidden_states.shape[-1] if original_hidden_states is not None - else 0 + else hidden_states.shape[-1] ) transformed_hidden_dim = hidden_states.shape[-1] if ( @@ -542,20 +650,22 @@ def _maybe_pad_hidden_states( def _apply_quant_method( self, layer: torch.nn.Module, + shared_output: torch.Tensor | None, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_input: torch.Tensor | None, - run_shared_experts_before: bool = True, ) -> tuple[torch.Tensor | None, torch.Tensor]: - shared_input = shared_input if shared_input is not None else hidden_states - shared_output: torch.Tensor | None = None - # Run this before quant_method to avoid inplace issues. 
- if run_shared_experts_before: - shared_output = self._apply_shared_experts(shared_input, False) + shared_input = shared_input if shared_input is not None else hidden_states + shared_output = self._maybe_apply_shared_experts( + shared_output, + shared_input, + shared_input, + SharedExpertsOrder.BEFORE_QUANT_METHOD, + ) if self.quant_method.is_monolithic: - result = self.quant_method.apply_monolithic( + fused_out = self.quant_method.apply_monolithic( layer=layer, x=hidden_states, router_logits=router_logits, @@ -566,7 +676,7 @@ def _apply_quant_method( router_logits=router_logits, ) - result = self.quant_method.apply( + fused_out = self.quant_method.apply( layer=layer, x=hidden_states, topk_weights=topk_weights, @@ -574,15 +684,23 @@ def _apply_quant_method( shared_experts_input=shared_input, ) - if isinstance(result, tuple): - assert shared_output is None - shared_output, hidden_states = result + # TODO: try to clean this up + if isinstance(fused_out, tuple): + if fused_out[0] is None: + assert shared_output is not None + hidden_states = fused_out[1] + else: + assert shared_output is None + shared_output, hidden_states = fused_out else: - hidden_states = result + hidden_states = fused_out - if not run_shared_experts_before and self.has_separate_shared_experts: - assert shared_output is None - shared_output = self._apply_shared_experts(shared_input, True) + shared_output = self._maybe_apply_shared_experts( + shared_output, + shared_input, + shared_input, + SharedExpertsOrder.AFTER_QUANT_METHOD, + ) return shared_output, hidden_states @@ -705,10 +823,10 @@ def forward( ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # For latent MoE: save ORIGINAL hidden_states before transform # (shared_experts need original dimension, routed experts use transformed) - if self.shared_experts is not None: - original_hidden_states = hidden_states - else: - original_hidden_states = None + original_hidden_states = ( + # or shared_experts is not None? 
+ hidden_states if self.routed_input_transform is not None else None + ) # Apply transform for routed experts (e.g., latent projection for latent MoE) hidden_states = self.apply_routed_input_transform(hidden_states) @@ -718,6 +836,7 @@ def forward( hidden_states, ) + # Note: moe_forward will call forward_dispatch fused_output = self.moe_forward( hidden_states, router_logits, @@ -725,8 +844,42 @@ def forward( self._encode_layer_name(), ) + # TODO: figure out how to combine this with maybe_reduce_output + if self.shared_experts is not None: + fused_output = ( + self._maybe_reduce_shared_out(fused_output[0]), + fused_output[1], + ) + return self._maybe_reduce_output(fused_output, og_hidden_dims) + def forward_dispatch( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_input: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + # TODO(bnell): this can be removed after MK migration is complete. + layer.ensure_moe_quant_config_init() + + router_logits = self._maybe_gate(hidden_states, router_logits) + with self._sequence_parallel_context(): + if self.use_dp_chunking: + return self.forward_impl_chunked( + layer, + hidden_states, + router_logits, + shared_input, + ) + else: + return self.forward_impl( + layer, + hidden_states, + router_logits, + shared_input, + ) + def _slice_and_copy_input( self, out_slice: torch.Tensor, @@ -811,6 +964,7 @@ def forward_impl_chunked( shared_output_chunk, hidden_states_chunk = self._apply_quant_method( layer=layer, + shared_output=None, hidden_states=hidden_states_chunk, router_logits=router_logits_chunk, shared_input=shared_input_chunk, @@ -842,15 +996,11 @@ def forward_impl( router_logits: torch.Tensor, shared_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # TODO(bnell): split this into runtime vs. static parts? 
- self.use_shared_experts_stream = self.allow_shared_experts_stream and ( - hidden_states.shape[0] <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD - ) - - # Check if we need to run shared experts before matrix multiply because - # matrix multiply may modify the hidden_states. - run_shared_experts_before = ( - self.has_separate_shared_experts and not self.use_shared_experts_stream + shared_output = self._maybe_apply_shared_experts( + None, + hidden_states, + shared_input, + SharedExpertsOrder.EXTERNAL, ) # The shared experts stream must be set up before calling the gate so they @@ -874,10 +1024,10 @@ def forward_impl( shared_output, hidden_states = self._apply_quant_method( layer=layer, + shared_output=shared_output, hidden_states=hidden_states, router_logits=router_logits, shared_input=shared_input, - run_shared_experts_before=run_shared_experts_before, ) return self._maybe_combine( From e94b8632fbb467c8dc277da894c71ec180b4952a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Feb 2026 19:20:36 -0500 Subject: [PATCH 010/191] wip Signed-off-by: Bill Nell --- .../fused_moe/runner/default_moe_runner.py | 165 +++--------------- 1 file changed, 29 insertions(+), 136 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 3bfefcb442fe..b0b59bbb0c1e 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -99,27 +99,6 @@ def _moe_forward( shared_experts_input: torch.Tensor | None, layer_name: _layer_name_type, ) -> torch.Tensor: -<<<<<<< HEAD - layer = get_layer_from_name(_resolve_layer_name(layer_name)) - # TODO(bnell): this can be removed after MK migration is complete. 
- layer.ensure_moe_quant_config_init() - runner = layer.runner - with runner._sequence_parallel_context(): - if runner.use_dp_chunking: - return runner.forward_impl_chunked( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) - else: - return runner.forward_impl( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) -======= layer = get_layer_from_name(layer_name) return layer.runner.forward_dispatch( layer, @@ -127,7 +106,6 @@ def _moe_forward( router_logits, shared_experts_input, ) ->>>>>>> d11342756e (wip more refactoring) def _moe_forward_fake( @@ -145,66 +123,6 @@ def _moe_forward_shared( shared_experts_input: torch.Tensor | None, layer_name: _layer_name_type, ) -> tuple[torch.Tensor, torch.Tensor]: -<<<<<<< HEAD - layer = get_layer_from_name(_resolve_layer_name(layer_name)) - # TODO(bnell): this can be removed after MK migration is complete. - layer.ensure_moe_quant_config_init() - runner = layer.runner - with runner._sequence_parallel_context(): - if runner.use_dp_chunking: - return runner.forward_impl_chunked( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) - else: - return runner.forward_impl( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) - - -def _moe_forward_shared_new( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: str, -) -> tuple[torch.Tensor, torch.Tensor]: - layer = get_layer_from_name(layer_name) - runner = layer.runner - - def reduce_shared_out(shared_out: torch.Tensor) -> torch.Tensor: - # Reduce shared expert outputs if necessary, since the MLP - # should have been created with reduce_results=False. 
- if ( - runner.reduce_results - and get_tensor_model_parallel_world_size() > 1 - and runner.must_reduce_shared_expert_outputs() - ): - shared_out = tensor_model_parallel_all_reduce(shared_out) - return shared_out - - if not runner.overlap_shared_experts: - shared_out = runner._shared_experts(hidden_states) - # XXXXXXXXXXXXXXXX - shared_experts = None - - router_logits = runner._maybe_gate(hidden_states, router_logits) - with runner._sequence_parallel_context(): - if runner.use_dp_chunking: - shared_out, fused_out = runner.forward_impl_chunked( - layer, hidden_states, router_logits - ) - else: - shared_out = fused_out = runner.forward_impl( - layer, hidden_states, router_logits - ) - - return reduce_shared_out(shared_out), fused_out -======= layer = get_layer_from_name(layer_name) return layer.runner.forward_dispatch( layer, @@ -212,7 +130,6 @@ def reduce_shared_out(shared_out: torch.Tensor) -> torch.Tensor: router_logits, shared_experts_input, ) ->>>>>>> d11342756e (wip more refactoring) def _moe_forward_shared_fake( @@ -328,7 +245,8 @@ def __init__( self.moe_config.moe_parallel_config.use_pplx_kernels or self.moe_config.moe_parallel_config.use_deepep_ll_kernels or self.moe_config.moe_parallel_config.use_mori_kernels - or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels + or self.moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels + or self.moe_config.moe_parallel_config.use_nixl_ep_kernels ) and envs.VLLM_ENABLE_MOE_DP_CHUNK # Needed for string -> FusedMoE layer lookup in custom ops. 
@@ -401,32 +319,7 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: else torch.ops.vllm.moe_forward_shared ) -<<<<<<< HEAD - @property - def use_dp_chunking(self) -> bool: - return ( - self.moe_config.moe_parallel_config.use_deepep_ll_kernels - or self.moe_config.moe_parallel_config.use_mori_kernels - or self.moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels - or self.moe_config.moe_parallel_config.use_nixl_ep_kernels - ) and envs.VLLM_ENABLE_MOE_DP_CHUNK - - # TODO(bnell): better name - @property - def overlap_shared_experts( - self, - ) -> bool: - # Disable shared expert overlap if: - # - we are using eplb with non-default backend, because of correctness issues - # - we are using flashinfer with DP, since there nothing to gain - # - we are using marlin kernels - backend = self.moe_config.moe_parallel_config.all2all_backend - return self.shared_experts is not None and not ( - (self.enable_eplb and backend != "allgather_reducescatter") - or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels - ) - - def _maybe_setup_shared_experts_stream( + def _setup_shared_experts_stream( self, hidden_states: torch.Tensor, shared_input: torch.Tensor | None, @@ -499,7 +392,8 @@ def _maybe_apply_shared_experts( assert self.shared_experts is not None assert shared_output is None if order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream: - hidden_states = self._maybe_setup_shared_experts_stream( + # TODO: fold this in? + hidden_states = self._setup_shared_experts_stream( hidden_states, shared_input, ) @@ -517,6 +411,12 @@ def _maybe_apply_shared_experts( else: shared_output = self.shared_experts(hidden_states) + if order == SharedExpertsOrder.EXTERNAL: + # TODO: figure out how to combine this with maybe_reduce_output? + # or get rid of it completely..... 
+ assert shared_output is not None + shared_output = self._maybe_reduce_shared_out(shared_output) + return shared_output def must_reduce_shared_expert_outputs(self) -> bool: @@ -816,6 +716,11 @@ def _maybe_combine( else: return hidden_states + # + # forward + # - self.moe_forward (_moe_forward or _moe_forward_shared) + # - forward_chunking_dispatch + # - forward_impl or forward_impl_chunked def forward( self, hidden_states: torch.Tensor, @@ -844,13 +749,6 @@ def forward( self._encode_layer_name(), ) - # TODO: figure out how to combine this with maybe_reduce_output - if self.shared_experts is not None: - fused_output = ( - self._maybe_reduce_shared_out(fused_output[0]), - fused_output[1], - ) - return self._maybe_reduce_output(fused_output, og_hidden_dims) def forward_dispatch( @@ -864,10 +762,19 @@ def forward_dispatch( layer.ensure_moe_quant_config_init() router_logits = self._maybe_gate(hidden_states, router_logits) + + shared_output = self._maybe_apply_shared_experts( + None, + hidden_states, + shared_input, + SharedExpertsOrder.EXTERNAL, + ) + with self._sequence_parallel_context(): if self.use_dp_chunking: return self.forward_impl_chunked( layer, + shared_output, hidden_states, router_logits, shared_input, @@ -875,6 +782,7 @@ def forward_dispatch( else: return self.forward_impl( layer, + shared_output, hidden_states, router_logits, shared_input, @@ -903,6 +811,7 @@ def _slice_and_copy_input( def forward_impl_chunked( self, layer: torch.nn.Module, + shared_output: torch.Tensor | None, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_input: torch.Tensor | None, @@ -964,7 +873,7 @@ def forward_impl_chunked( shared_output_chunk, hidden_states_chunk = self._apply_quant_method( layer=layer, - shared_output=None, + shared_output=shared_output, hidden_states=hidden_states_chunk, router_logits=router_logits_chunk, shared_input=shared_input_chunk, @@ -992,27 +901,11 @@ def forward_impl_chunked( def forward_impl( self, layer: torch.nn.Module, + 
shared_output: torch.Tensor | None, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - shared_output = self._maybe_apply_shared_experts( - None, - hidden_states, - shared_input, - SharedExpertsOrder.EXTERNAL, - ) - - # The shared experts stream must be set up before calling the gate so they - # can be overlapped. - if not run_shared_experts_before: - self._maybe_setup_shared_experts_stream( - hidden_states, - shared_input, - ) - - router_logits = self._maybe_gate(hidden_states, router_logits) - # TODO(bnell): parts of the dispatch/combine steps will go away once # #32567 lands and the remaining kernels are made MKs. The PCP # code will probably remain From 6cc50741324a3da05fd797605f7aa5055be82151 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 23 Feb 2026 16:49:41 -0500 Subject: [PATCH 011/191] SharedExperts wip Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 6 +- .../layers/fused_moe/cutlass_moe.py | 2 + .../layers/fused_moe/fused_moe_method_base.py | 4 +- .../fused_moe/fused_moe_modular_method.py | 10 +- vllm/model_executor/layers/fused_moe/layer.py | 65 ++-- .../layers/fused_moe/modular_kernel.py | 58 ++-- .../layers/fused_moe/oracle/fp8.py | 9 +- .../layers/fused_moe/oracle/nvfp4.py | 9 +- .../layers/fused_moe/oracle/unquantized.py | 4 + .../fused_moe/runner/default_moe_runner.py | 298 ++++-------------- .../layers/fused_moe/runner/shared_experts.py | 207 ++++++++++++ .../fused_moe/unquantized_fused_moe_method.py | 10 +- .../layers/quantization/awq_marlin.py | 2 +- .../layers/quantization/bitsandbytes.py | 2 +- .../compressed_tensors_moe.py | 20 +- .../layers/quantization/experts_int8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 4 +- .../layers/quantization/gguf.py | 2 +- .../layers/quantization/gptq_marlin.py | 2 +- .../layers/quantization/modelopt.py | 8 +- .../layers/quantization/moe_wna16.py | 2 +- .../layers/quantization/mxfp4.py 
| 4 +- .../layers/quantization/quark/quark_moe.py | 6 +- 23 files changed, 391 insertions(+), 345 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/runner/shared_experts.py diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 5aa8a56d4115..b6c62b618a09 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -150,9 +150,11 @@ def _inject_lora_into_fused_moe(self): self.base_layer.quant_method.select_gemm_impl( prepare_finalize, self.base_layer ), - self.base_layer.runner.get_shared_experts(), # XXXXXXXXXXXXXXXXXXXX + self.base_layer.shared_experts, ) + # TODO: could be incorrect due to monolithic kernel? or add assert it + # is modular? if quant_config.use_mxfp4_w4a16: assert isinstance( m_fused_moe_fn.impl.fused_experts, @@ -170,6 +172,7 @@ def wrapper(*args, **kwargs): moe_state_dict["apply_router_weight_on_input"] = kwargs[ "apply_router_weight_on_input" ] + # TODO: global_num_experts/shared_experts_input? result = func(*args, **kwargs) return result @@ -339,6 +342,7 @@ def wrapper(*args, **kwargs): fused_experts = m_fused_moe_fn.impl.fused_experts + # TODO: seems like this could be done with modular kernel subclasses? 
m_fused_moe_fn.apply = fwd_decorator(self.base_layer, m_fused_moe_fn.apply) fused_experts.activation = act_decorator( self.base_layer, fused_experts.activation diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 75ee776646ba..43082b3675a6 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1194,6 +1194,8 @@ def cutlass_moe_w4a8_fp8( quant_config=quant_config, group_size=group_size, ), + shared_experts=None, + inplace=False, ) return fn.apply( diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index f6a303e7988e..9350a9c7de74 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -130,7 +130,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: raise NotImplementedError def apply_monolithic( @@ -138,5 +138,5 @@ def apply_monolithic( layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 0065c11f3163..142e180786c6 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -16,6 +16,9 @@ FusedMoEKernel, FusedMoEPrepareAndFinalizeModular, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) logger = init_logger(__name__) @@ -44,7 +47,7 @@ def make( moe_layer: torch.nn.Module, 
old_quant_method: FusedMoEMethodBase, prepare_finalize: FusedMoEPrepareAndFinalizeModular, - shared_experts: torch.nn.Module | None, + shared_experts: SharedExperts | None, inplace: bool = False, ) -> "FusedMoEModularMethod": return FusedMoEModularMethod( @@ -52,8 +55,7 @@ def make( FusedMoEKernel( prepare_finalize, old_quant_method.select_gemm_impl(prepare_finalize, moe_layer), - shared_experts, - moe_parallel_config=moe_layer.moe_parallel_config, + shared_experts=shared_experts, inplace=inplace, ), ) @@ -89,7 +91,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.moe_kernel is not None return self.moe_kernel.apply( hidden_states=x, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c6c6dab9dd04..bbf6e2ffedb9 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -42,6 +42,9 @@ from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import ( DefaultMoERunner, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) @@ -337,7 +340,6 @@ def __init__( super().__init__() self._gate = gate - self._shared_experts = shared_experts self._routed_input_transform = routed_input_transform if params_dtype is None: @@ -565,7 +567,7 @@ def __init__( device=vllm_config.device_config.device, routing_method=self.routing_method_type, # TODO: in_dtype == out_dtype? 
- disable_inplace=disable_inplace() or self._shared_experts is not None, + disable_inplace=disable_inplace() or self.shared_experts is not None, ) if self.moe_config.use_mori_kernels: assert self.rocm_aiter_fmoe_enabled, ( @@ -632,22 +634,30 @@ def _get_quant_method() -> FusedMoEMethodBase: self.quant_method.create_weights(layer=self, **moe_quant_params) self.base_quant_method = self.quant_method - # Disable shared expert overlap if: - # - we are using eplb with non-default backend, because of correctness issues - # - we are using flashinfer with DP, since there nothing to gain - # - we are using marlin kernels - # backend = self.moe_parallel_config.all2all_backend - # self.use_overlapped = ( - # not ( - # (self.enable_eplb and backend != "allgather_reducescatter") - # or self.moe_parallel_config.use_fi_all2allv_kernels - # ) - # and self._shared_experts is not None - # ) - + self._shared_experts = shared_experts + self.shared_experts = self._init_shared_experts() self.runner = self._init_runner() - def _init_runner(self): + def _init_shared_experts(self) -> SharedExperts | None: + if self._shared_experts is None: + return None + + reduce_shared_output = ( + self.reduce_results + # XXXX ordering issue + and self.quant_method.moe_mk is not None + and self.quant_method.moe_mk.output_is_reduced() + ) + + return SharedExperts( + self._shared_experts, + moe_config=self.moe_config, + has_separate_shared_experts=not self.quant_method.mk_owns_shared_expert, + use_dp_chunking=self.use_dp_chunking, # XXXXXXXXXXXXXX + must_reduce_shared_expert_outputs=reduce_shared_output, + ) + + def _init_runner(self) -> DefaultMoERunner: # Storing the runner in the FusedMoE is an intermediate state, eventually # the runner will own the FusedMoE layer and provide the execution interface # for MoE ops. 
@@ -657,7 +667,7 @@ def _init_runner(self): router=self.router, routed_input_transform=self._routed_input_transform, gate=self._gate, - shared_experts=self._shared_experts, + shared_experts=self.shared_experts, quant_method=self.quant_method, reduce_results=self.reduce_results, enable_dbo=self.vllm_config.parallel_config.enable_dbo, @@ -700,26 +710,17 @@ def maybe_init_modular_kernel(self) -> None: self, self.base_quant_method, prepare_finalize, - self.runner.get_shared_experts(), # XXXXXXXXXXXXXXXXX + self.runner._get_shared_experts(), # XXXXXXXXXXXXXXXXX inplace=not self.moe_config.disable_inplace, ) ) - # @property - # def shared_experts(self) -> torch.nn.Module | None: - # return self._shared_experts if self.use_overlapped else None - - # TODO(bnell): is this needed? - # @property - # def layer_id(self): - # # Delayed import to avoid circular dependency - # from vllm.model_executor.models.utils import extract_layer_index - - # return extract_layer_index(self.layer_name) + @property + def layer_id(self): + # Delayed import to avoid circular dependency + from vllm.model_executor.models.utils import extract_layer_index - # @property - # def gate(self) -> torch.nn.Module | None: - # return self._gate if self.use_overlapped else None + return extract_layer_index(self.layer_name) @property def tp_size(self): diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index a6b498834017..bbafc682c102 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -20,6 +20,10 @@ FusedMoEQuantConfig, RoutingMethodType, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, + SharedExpertsOrder, +) from vllm.model_executor.layers.fused_moe.utils import ( _resize_cache, disable_inplace, @@ -234,6 +238,13 @@ def output_is_reduced(self) -> bool: """ raise NotImplementedError + def supports_async(self) -> 
bool: + """ + Indicates whether or not this class implements prepare_async and + finalize_async. + """ + return False + # TODO: pass FusedMoEParallelConfig in as ctor parameter? class FusedMoEPrepareAndFinalizeModular(FusedMoEPrepareAndFinalize): @@ -280,13 +291,6 @@ def prepare( """ raise NotImplementedError - def supports_async(self) -> bool: - """ - Indicates whether or not this class implements prepare_async and - finalize_async. - """ - return False - def prepare_async( self, a1: torch.Tensor, @@ -991,8 +995,7 @@ def __init__( self, prepare_finalize: FusedMoEPrepareAndFinalizeModular, fused_experts: FusedMoEExpertsModular, - shared_experts: torch.nn.Module | None = None, - moe_parallel_config: FusedMoEParallelConfig | None = None, + shared_experts: SharedExperts | None, inplace: bool = False, ): self.prepare_finalize = prepare_finalize @@ -1000,6 +1003,9 @@ def __init__( self.shared_experts = shared_experts self.moe_parallel_config = moe_parallel_config self.inplace = inplace + + moe_parallel_config = fused_experts.moe_config.moe_parallel_config + self.moe_parallel_config: FusedMoEParallelConfig | None = moe_parallel_config self.is_dp_ep = ( moe_parallel_config is not None and moe_parallel_config.dp_size > 1 @@ -1069,6 +1075,17 @@ def _allocate_buffers( return workspace13, workspace2, fused_out + def _maybe_apply_shared_experts( + self, + shared_experts_input: torch.Tensor | None, + ): + if self.shared_experts is not None: + assert shared_experts_input is not None + self.shared_experts.apply( + shared_experts_input, + SharedExpertsOrder.INTERNAL, + ) + def _prepare( self, hidden_states: torch.Tensor, @@ -1241,15 +1258,6 @@ def _finalize( shared_experts_input is the original hidden_states (full dimension) needed by the shared expert MLP. """ - shared_output: torch.Tensor | None = None - - # For latent MoE: shared experts need the original hidden_states - # (full hidden_size), not the latent-projected version used by - # routed experts. 
- se_hidden_states = ( - shared_experts_input if shared_experts_input is not None else hidden_states - ) - if not self.prepare_finalize.supports_async(): assert not dbo_enabled() @@ -1261,8 +1269,7 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - if self.shared_experts is not None: - shared_output = self.shared_experts(se_hidden_states) + self._maybe_apply_shared_experts(shared_experts_input) else: finalize_ret = self.prepare_finalize.finalize_async( output, @@ -1272,8 +1279,7 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - if self.shared_experts is not None: - shared_output = self.shared_experts(se_hidden_states) + self._maybe_apply_shared_experts(shared_experts_input) # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1296,11 +1302,7 @@ def _finalize( receiver() - if self.shared_experts is None: - return output - else: - assert shared_output is not None - return shared_output, output + return output def apply( self, @@ -1314,7 +1316,7 @@ def apply( expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, shared_experts_input: torch.Tensor | None = None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. 
diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index a63c02663886..57431b946e24 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -18,6 +18,9 @@ fp8_w8a8_moe_quant_config, fp8_w8a16_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( FlashinferMoeBackend, get_flashinfer_moe_backend, @@ -545,7 +548,7 @@ def make_fp8_moe_kernel( experts_cls: type[mk.FusedMoEExperts], fp8_backend: Fp8MoeBackend, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: torch.nn.Module | None = None, + shared_experts: SharedExperts | None = None, ) -> mk.FusedMoEKernel: # Create Prepare/Finalize. prepare_finalize = maybe_make_prepare_finalize( @@ -583,10 +586,8 @@ def make_fp8_moe_kernel( experts, shared_experts=( shared_experts - if moe_config.moe_parallel_config.use_deepep_ll_kernels - else None + if prepare_finalize.supports_async() else None ), - moe_parallel_config=moe_config.moe_parallel_config, inplace=( not moe_config.disable_inplace and fp8_backend != Fp8MoeBackend.FLASHINFER_CUTLASS diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index 8a224cb39e7c..194513981e41 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ -18,6 +18,9 @@ nvfp4_moe_quant_config, nvfp4_w4a16_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( prepare_nvfp4_moe_layer_for_fi_or_cutlass, ) @@ -397,7 +400,7 @@ def make_nvfp4_moe_kernel( moe_config: FusedMoEConfig, experts_cls: type[mk.FusedMoEExperts], 
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: torch.nn.Module | None = None, + shared_experts: SharedExperts | None = None, ) -> mk.FusedMoEKernel: # Create Prepare/Finalize. prepare_finalize = maybe_make_prepare_finalize( @@ -435,10 +438,8 @@ def make_nvfp4_moe_kernel( experts, shared_experts=( shared_experts - if moe_config.moe_parallel_config.use_deepep_ll_kernels - else None + if prepare_finalize.supports_async() else None ), - moe_parallel_config=moe_config.moe_parallel_config, inplace=False, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py index 9c31da10dd94..926819592c3a 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py +++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py @@ -224,6 +224,7 @@ def make_unquantized_moe_kernel( moe_config=moe_config, quant_config=quant_config, ), + shared_experts=None, inplace=False, ) @@ -238,6 +239,7 @@ def make_unquantized_moe_kernel( moe_config=moe_config, quant_config=quant_config, ), + shared_experts=None, inplace=not moe_config.disable_inplace, ) elif backend == UnquantizedMoeBackend.TRITON: @@ -249,6 +251,7 @@ def make_unquantized_moe_kernel( moe_config=moe_config, quant_config=quant_config, ), + shared_experts=None, inplace=not moe_config.disable_inplace, ) elif backend == UnquantizedMoeBackend.XPU: @@ -260,6 +263,7 @@ def make_unquantized_moe_kernel( moe_config=moe_config, quant_config=quant_config, ), + shared_experts=None, inplace=not moe_config.disable_inplace, ) return kernel diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index b0b59bbb0c1e..cc177b946cf1 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -3,7 +3,6 @@ from collections.abc import 
Callable from contextlib import nullcontext from typing import TYPE_CHECKING -from enum import IntEnum import torch import torch.nn.functional as F @@ -12,7 +11,6 @@ from vllm.distributed import ( get_ep_group, get_pcp_group, - get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) from vllm.forward_context import ( @@ -31,6 +29,10 @@ FusedMoERouter, ) from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, + SharedExpertsOrder, +) from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import ( @@ -45,23 +47,6 @@ logger = init_logger(__name__) -class SharedExpertsOrder(IntEnum): - # No shared experts. - NONE = (0,) - - # Get rid of this one? combine with BEFORE? - EXTERNAL = (1,) - - # Called by modular kernel. - INTERNAL = (2,) - - # Called right before quant_method is executed. - BEFORE_QUANT_METHOD = (3,) - - # Called right after quant_method is executed (possibly with streaming). - AFTER_QUANT_METHOD = (4,) - - def get_layer_from_name(layer_name: str) -> torch.nn.Module: forward_context: ForwardContext = get_forward_context() if layer_name == "from_forward_context": @@ -154,7 +139,7 @@ def _moe_forward_shared_fake( direct_register_custom_op( op_name="moe_forward", op_func=_moe_forward, - mutates_args=["hidden_states"], + mutates_args=["hidden_states"], # ? fake_impl=_moe_forward_fake, tags=(torch.Tag.needs_fixed_stride_order,), ) @@ -163,7 +148,7 @@ def _moe_forward_shared_fake( direct_register_custom_op( op_name="moe_forward_shared", op_func=_moe_forward_shared, - mutates_args=["hidden_states"], + mutates_args=["hidden_states"], # ? 
fake_impl=_moe_forward_shared_fake, tags=(torch.Tag.needs_fixed_stride_order,), ) @@ -198,7 +183,7 @@ def __init__( router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, - shared_experts: torch.nn.Module | None, + shared_experts: SharedExperts | None, quant_method: FusedMoEMethodBase, reduce_results: bool, enable_dbo: bool, @@ -224,23 +209,6 @@ def __init__( self.batched_router_logits: torch.Tensor | None = None self._maybe_init_dp_chunking() - # Allow disabling of the separate shared experts stream for - # debug purposes. - # TODO: Remove this after more extensive testings with TP/DP - # and other execution modes - self.use_shared_experts_stream = False - if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: - logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") - self.shared_experts_stream = None - else: - # TODO(rob): enable shared expert overlap with non-cuda-alike. - # aux_stream() returns None on non-cuda-alike platforms. 
- self.shared_experts_stream = aux_stream() - if self.shared_experts_stream is not None: - logger.debug_once( - "Enabled separate cuda stream for MoE shared_experts", scope="local" - ) - self.use_dp_chunking = ( self.moe_config.moe_parallel_config.use_pplx_kernels or self.moe_config.moe_parallel_config.use_deepep_ll_kernels @@ -254,57 +222,9 @@ def __init__( self.moe_forward = self._select_forward(layer) - def _has_external_experts(self) -> bool: - # Disable shared expert overlap if: - # - we are using eplb with non-default backend, because of correctness issues - # - we are using flashinfer with DP, since there nothing to gain - backend = self.moe_config.moe_parallel_config.all2all_backend - return self.shared_experts is not None and not ( - (self.enable_eplb and backend != "allgather_reducescatter") - or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels - ) - - def _determine_shared_experts_order( - self, - hidden_states: torch.Tensor, - ) -> tuple[SharedExpertsOrder, bool]: - if self.shared_experts is None: - return SharedExpertsOrder.NONE, False - - if self._has_external_experts(): - return SharedExpertsOrder.EXTERNAL, False - - has_separate_shared_experts = not self.quant_method.mk_owns_shared_expert - - if not has_separate_shared_experts: - return SharedExpertsOrder.INTERNAL, False - - allow_shared_experts_stream = ( - current_platform.is_cuda() - and has_separate_shared_experts - and not self.use_dp_chunking - and self.shared_experts_stream is not None - and hidden_states.shape[0] - <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD - ) - - # Check if we need to run shared experts before matrix multiply because - # matrix multiply may modify the hidden_states. 
- run_shared_experts_before = ( - has_separate_shared_experts and not allow_shared_experts_stream - ) - - if run_shared_experts_before: - return SharedExpertsOrder.BEFORE_QUANT_METHOD, False - else: - return SharedExpertsOrder.AFTER_QUANT_METHOD, allow_shared_experts_stream - # XXXXXX used by layer.py and lora/layers/fused_moe.py - def get_shared_experts(self) -> torch.nn.Module | None: - if self._has_external_experts(): - return self.shared_experts - else: - return None + def _get_shared_experts(self) -> SharedExperts | None: + return self.shared_experts def _select_forward(self, layer: torch.nn.Module) -> Callable: if current_platform.is_tpu() or current_platform.is_cpu(): @@ -319,33 +239,6 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: else torch.ops.vllm.moe_forward_shared ) - def _setup_shared_experts_stream( - self, - hidden_states: torch.Tensor, - shared_input: torch.Tensor | None, - ): - if self.use_shared_experts_stream: - assert self.shared_experts_stream is not None - assert self.moe_config.disable_inplace - - shared_experts_input = ( - shared_input if shared_input is not None else hidden_states - ) - - # Record that the shared_experts_input will be used in the - # shared_experts_stream to avoid gc issue from - # deallocation. For more details: - # https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 - # NOTE: We don't need shared_output.record_stream(current_stream()) - # because we synch the streams before using shared_output. 
- shared_experts_input.record_stream(self.shared_experts_stream) - - # Mark sync start point for the separate shared experts - # stream here since we want to run in parallel with the - # router/gate (next op below) - assert self.shared_experts_stream is not None - self.shared_experts_stream.wait_stream(current_stream()) - def _maybe_init_dp_chunking(self): if not self.use_dp_chunking: return @@ -376,49 +269,6 @@ def _maybe_init_dp_chunking(self): device=device, ) - def _maybe_apply_shared_experts( - self, - shared_output: torch.Tensor | None, - hidden_states: torch.Tensor, - shared_input: torch.Tensor | None, - order: SharedExpertsOrder, - ) -> torch.Tensor | None: - experts_order, use_shared_experts_stream = self._determine_shared_experts_order( - hidden_states - ) - if order != experts_order: - return None - - assert self.shared_experts is not None - assert shared_output is None - if order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream: - # TODO: fold this in? - hidden_states = self._setup_shared_experts_stream( - hidden_states, - shared_input, - ) - - # Run shared experts in parallel on a separate stream - # NOTE: We start the separate stream here and mark the - # sync end point immediately after it is done. This is - # important to avoid excessive stream allocations by the cuda - # graph replay later. - with torch.cuda.stream(self.shared_experts_stream): - # Note that hidden_states clone() is necessary here to avoid - # conflict with the main stream - shared_output = self.shared_experts(hidden_states) - current_stream().wait_stream(self.shared_experts_stream) - else: - shared_output = self.shared_experts(hidden_states) - - if order == SharedExpertsOrder.EXTERNAL: - # TODO: figure out how to combine this with maybe_reduce_output? - # or get rid of it completely..... 
- assert shared_output is not None - shared_output = self._maybe_reduce_shared_out(shared_output) - - return shared_output - def must_reduce_shared_expert_outputs(self) -> bool: """ The shared_experts are typically computed using the RowParallelLinear @@ -446,7 +296,9 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens else: return tensor_model_parallel_all_reduce(final_hidden_states) - def apply_routed_input_transform(self, hidden_states: torch.Tensor) -> torch.Tensor: + def apply_routed_input_transform( + self, hidden_states: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor | None]: """Apply transform for routed experts (e.g., latent projection). This is called by FusedMoE.forward_native. The original hidden_states @@ -462,21 +314,13 @@ def apply_routed_input_transform(self, hidden_states: torch.Tensor) -> torch.Ten # ReplicatedLinear returns (output, extra_bias) tuple. # We only need the output tensor; extra_bias is not used here. if isinstance(result, tuple): - return result[0] - return result - return hidden_states - - # TODO: combine with runner.forward_impl_X - def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor: - # Reduce shared expert outputs if necessary, since the MLP - # should have been created with reduce_results=False. 
- if ( - self.reduce_results - and get_tensor_model_parallel_world_size() > 1 - and self.must_reduce_shared_expert_outputs() - ): - shared_out = tensor_model_parallel_all_reduce(shared_out) - return shared_out + return result[0], hidden_states + return result, hidden_states + + return ( + hidden_states, + hidden_states if self.shared_experts is not None else None, + ) # XXXXX def _maybe_reduce_output( self, @@ -520,14 +364,10 @@ def _encode_layer_name(self) -> str | ModuleName: def _maybe_pad_hidden_states( self, - original_hidden_states: torch.Tensor | None, + shared_experts_input: torch.Tensor, hidden_states: torch.Tensor, ) -> tuple[torch.Tensor, list[int]]: - original_hidden_dim = ( - original_hidden_states.shape[-1] - if original_hidden_states is not None - else hidden_states.shape[-1] - ) + shared_experts_hidden_dim = shared_experts_input.shape[-1] transformed_hidden_dim = hidden_states.shape[-1] if ( not self.quant_method.skip_forward_padding @@ -541,26 +381,31 @@ def _maybe_pad_hidden_states( ) if self.shared_experts is not None: - orig_hidden_dims = [original_hidden_dim, transformed_hidden_dim] + orig_hidden_dims = [shared_experts_hidden_dim, transformed_hidden_dim] else: orig_hidden_dims = [transformed_hidden_dim] return hidden_states, orig_hidden_dims + def _maybe_apply_shared_experts( + self, + shared_experts_input: torch.Tensor | None, + order: SharedExpertsOrder, + ): + if self.shared_experts is not None: + assert shared_experts_input is not None + self.shared_experts.apply(shared_experts_input, order) + def _apply_quant_method( self, layer: torch.nn.Module, - shared_output: torch.Tensor | None, hidden_states: torch.Tensor, router_logits: torch.Tensor, - shared_input: torch.Tensor | None, + shared_experts_input: torch.Tensor | None, ) -> tuple[torch.Tensor | None, torch.Tensor]: # Run this before quant_method to avoid inplace issues. 
- shared_input = shared_input if shared_input is not None else hidden_states - shared_output = self._maybe_apply_shared_experts( - shared_output, - shared_input, - shared_input, + self._maybe_apply_shared_experts( + shared_experts_input, SharedExpertsOrder.BEFORE_QUANT_METHOD, ) @@ -581,28 +426,18 @@ def _apply_quant_method( x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, - shared_experts_input=shared_input, + shared_experts_input=shared_experts_input, ) - # TODO: try to clean this up - if isinstance(fused_out, tuple): - if fused_out[0] is None: - assert shared_output is not None - hidden_states = fused_out[1] - else: - assert shared_output is None - shared_output, hidden_states = fused_out - else: - hidden_states = fused_out - - shared_output = self._maybe_apply_shared_experts( - shared_output, - shared_input, - shared_input, + self._maybe_apply_shared_experts( + shared_experts_input, SharedExpertsOrder.AFTER_QUANT_METHOD, ) - return shared_output, hidden_states + return ( + self.shared_experts.output if self.shared_experts is not None else None, + fused_out, + ) def _sequence_parallel_context(self): ctx = get_forward_context() @@ -719,33 +554,28 @@ def _maybe_combine( # # forward # - self.moe_forward (_moe_forward or _moe_forward_shared) - # - forward_chunking_dispatch + # - forward_dispatch # - forward_impl or forward_impl_chunked + # def forward( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # For latent MoE: save ORIGINAL hidden_states before transform - # (shared_experts need original dimension, routed experts use transformed) - original_hidden_states = ( - # or shared_experts is not None? 
- hidden_states if self.routed_input_transform is not None else None - ) - # Apply transform for routed experts (e.g., latent projection for latent MoE) - hidden_states = self.apply_routed_input_transform(hidden_states) + hidden_states, shared_experts_input = self.apply_routed_input_transform( + hidden_states + ) hidden_states, og_hidden_dims = self._maybe_pad_hidden_states( - original_hidden_states, + shared_experts_input, hidden_states, ) - # Note: moe_forward will call forward_dispatch fused_output = self.moe_forward( hidden_states, router_logits, - original_hidden_states, + shared_experts_input, self._encode_layer_name(), ) @@ -756,17 +586,15 @@ def forward_dispatch( layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, - shared_input: torch.Tensor | None, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() router_logits = self._maybe_gate(hidden_states, router_logits) - shared_output = self._maybe_apply_shared_experts( - None, - hidden_states, - shared_input, + self._maybe_apply_shared_experts( + shared_experts_input, SharedExpertsOrder.EXTERNAL, ) @@ -774,18 +602,16 @@ def forward_dispatch( if self.use_dp_chunking: return self.forward_impl_chunked( layer, - shared_output, hidden_states, router_logits, - shared_input, + shared_experts_input, ) else: return self.forward_impl( layer, - shared_output, hidden_states, router_logits, - shared_input, + shared_experts_input, ) def _slice_and_copy_input( @@ -811,10 +637,9 @@ def _slice_and_copy_input( def forward_impl_chunked( self, layer: torch.nn.Module, - shared_output: torch.Tensor | None, hidden_states: torch.Tensor, router_logits: torch.Tensor, - shared_input: torch.Tensor | None, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # Gate overlap not supported when chunking is enabled. 
Run the # gate first. @@ -865,18 +690,17 @@ def forward_impl_chunked( chunk_end, ) - shared_input_chunk = ( - shared_input[chunk_start:chunk_end, :] - if shared_input is not None + shared_experts_input_chunk = ( + shared_experts_input[chunk_start:chunk_end, :] + if shared_experts_input is not None else None ) shared_output_chunk, hidden_states_chunk = self._apply_quant_method( layer=layer, - shared_output=shared_output, hidden_states=hidden_states_chunk, router_logits=router_logits_chunk, - shared_input=shared_input_chunk, + shared_experts_input=shared_experts_input_chunk, ) # Store outputs @@ -901,10 +725,9 @@ def forward_impl_chunked( def forward_impl( self, layer: torch.nn.Module, - shared_output: torch.Tensor | None, hidden_states: torch.Tensor, router_logits: torch.Tensor, - shared_input: torch.Tensor | None, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # TODO(bnell): parts of the dispatch/combine steps will go away once # #32567 lands and the remaining kernels are made MKs. 
The PCP @@ -917,10 +740,9 @@ def forward_impl( shared_output, hidden_states = self._apply_quant_method( layer=layer, - shared_output=shared_output, hidden_states=hidden_states, router_logits=router_logits, - shared_input=shared_input, + shared_experts_input=shared_experts_input, ) return self._maybe_combine( diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py new file mode 100644 index 000000000000..ad1bac4dbb03 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from enum import IntEnum + +import torch + +import vllm.envs as envs +from vllm.distributed import ( + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, +) +from vllm.platforms import current_platform +from vllm.utils.torch_utils import ( + aux_stream, + current_stream, +) + +logger = init_logger(__name__) + + +class SharedExpertsOrder(IntEnum): + # No shared experts. + NONE = (0,) + + # Get rid of this one? combine with BEFORE? + # Note: this might be important for torch.compile reasons. Can + # get rid of it after _moe_forward is undone. + EXTERNAL = (1,) + + # Called by modular kernel. + INTERNAL = (2,) + + # Called right before quant_method is executed. + BEFORE_QUANT_METHOD = (3,) + + # Called right after quant_method is executed (possibly with streaming). + AFTER_QUANT_METHOD = (4,) + + +# XXXXX add method to prime with shared_experts_input? 
# NO (answer to the design question in the comment directly above).
class SharedExperts:
    """Owns a MoE layer's shared-experts module and decides when it runs.

    The runner (and possibly the modular kernel) calls `apply()` at several
    well-known points, each identified by a `SharedExpertsOrder` value.
    `apply()` only executes the wrapped module when the requested point
    matches the one chosen by `_determine_shared_experts_order()`; the
    result is cached and later retrieved (and cleared) through `output`.
    """

    def __init__(
        self,
        shared_experts: torch.nn.Module,
        moe_config: FusedMoEConfig,
        has_separate_shared_experts: bool,  # better name
        use_dp_chunking: bool,
        must_reduce_shared_expert_outputs: bool,  # and reduce_results
    ):
        """Store the configuration and pick the optional auxiliary stream.

        Args:
            shared_experts: Module invoked as `shared_experts(input)`.
            moe_config: MoE configuration; its `moe_parallel_config` and
                `disable_inplace` fields are read by this class.
            has_separate_shared_experts: When False the order is reported
                as INTERNAL (see `_determine_shared_experts_order`).
            use_dp_chunking: When True the auxiliary stream is never used.
            must_reduce_shared_expert_outputs: When True (and TP world
                size > 1) the EXTERNAL output is all-reduced.
        """
        # Result of the most recent matching `apply()`; consumed by `output`.
        self._output: torch.Tensor | None = None
        self._shared_experts = shared_experts
        self._moe_config = moe_config  # invariant
        self._use_dp_chunking = use_dp_chunking  # invariant

        self._has_separate_shared_experts = has_separate_shared_experts  # depends on MK
        self._must_reduce_shared_expert_outputs = (
            must_reduce_shared_expert_outputs  # depends on MK
        )

        # Allow disabling of the separate shared experts stream for
        # debug purposes.
        # TODO: Remove this after more extensive testings with TP/DP
        # and other execution modes
        # NOTE(review): this flag is initialised here but never read inside
        # this class; confirm whether external code still depends on it.
        self.use_shared_experts_stream = False
        if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM:
            logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local")
            self._shared_experts_stream = None
        else:
            # TODO(rob): enable shared expert overlap with non-cuda-alike.
            # aux_stream() returns None on non-cuda-alike platforms.
            self._shared_experts_stream = aux_stream()
            if self._shared_experts_stream is not None:
                logger.debug_once(
                    "Enabled separate cuda stream for MoE shared_experts", scope="local"
                )

    def _has_external_experts(self) -> bool:
        """Return True when the shared experts run at the EXTERNAL point."""
        # Disable shared expert overlap if:
        # - we are using eplb with non-default backend, because of correctness issues
        # - we are using flashinfer with DP, since there nothing to gain
        backend = self._moe_config.moe_parallel_config.all2all_backend
        return self._shared_experts is not None and not (
            (
                self._moe_config.moe_parallel_config.enable_eplb
                and backend != "allgather_reducescatter"
            )
            or self._moe_config.moe_parallel_config.use_fi_all2allv_kernels
        )

    def _determine_shared_experts_order(
        self,
        hidden_states: torch.Tensor,
    ) -> tuple[SharedExpertsOrder, bool]:
        """Choose the execution point for the shared experts.

        Args:
            hidden_states: Tensor whose leading dimension is compared with
                `VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD`.

        Returns:
            `(order, use_stream)`. `use_stream` can only be True together
            with `SharedExpertsOrder.AFTER_QUANT_METHOD`.
        """
        if self._shared_experts is None:
            return SharedExpertsOrder.NONE, False

        if self._has_external_experts():
            return SharedExpertsOrder.EXTERNAL, False

        if (
            not self._has_separate_shared_experts
            or not self._moe_config.moe_parallel_config.use_all2all_kernels
        ):
            return SharedExpertsOrder.INTERNAL, False

        allow_shared_experts_stream = (
            current_platform.is_cuda()
            and self._has_separate_shared_experts
            and not self._use_dp_chunking
            and self._shared_experts_stream is not None
            and hidden_states.shape[0]
            <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD
        )

        # Check if we need to run shared experts before matrix multiply because
        # matrix multiply may modify the hidden_states.
        run_shared_experts_before = (
            self._has_separate_shared_experts and not allow_shared_experts_stream
        )

        if run_shared_experts_before:
            return SharedExpertsOrder.BEFORE_QUANT_METHOD, False
        else:
            return SharedExpertsOrder.AFTER_QUANT_METHOD, allow_shared_experts_stream

    def _setup_shared_experts_stream(
        self,
        shared_experts_input: torch.Tensor,
    ):
        """Prepare the auxiliary stream to consume `shared_experts_input`."""
        assert self._shared_experts_stream is not None
        assert self._moe_config.disable_inplace

        # Record that the input tensor will be used by shared_experts_stream
        # to avoid gc issues from its deallocation on the main stream.
        # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
        # NOTE: We don't need shared_output.record_stream(current_stream())
        # because we synch the streams before using shared_output.
        shared_experts_input.record_stream(self._shared_experts_stream)

        # Mark the sync start point for the separate shared experts stream:
        # it must not start before work already queued on the current stream.
        self._shared_experts_stream.wait_stream(current_stream())

    def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor:
        """All-reduce `shared_out` across TP ranks when configured to."""
        # Reduce shared expert outputs if necessary, since the MLP
        # should have been created with reduce_results=False.
        if (
            self._must_reduce_shared_expert_outputs
            and get_tensor_model_parallel_world_size() > 1
        ):
            shared_out = tensor_model_parallel_all_reduce(shared_out)
        return shared_out

    @property
    def output(self) -> torch.Tensor | None:
        """Return the cached shared-experts output and clear the cache.

        Reading this property consumes the value: a second read without an
        intervening matching `apply()` finds the cache empty.
        """
        # The two None-ness tests must be parenthesised. Without the
        # parentheses Python chains the comparisons as
        # `a is None and None == b and b is None`, which is False whenever a
        # shared-experts module is present, i.e. the assert always fired.
        # NOTE(review): this still requires that some `apply()` call has
        # populated the cache before `output` is read; confirm that holds
        # for the INTERNAL order as well.
        assert (self._shared_experts is None) == (self._output is None)
        output = self._output
        self._output = None
        return output

    # add split/join?

    def apply(
        self,
        shared_experts_input: torch.Tensor,
        order: SharedExpertsOrder,
    ) -> torch.Tensor | None:
        """Run the shared experts if `order` is the selected execution point.

        Args:
            shared_experts_input: Input passed to the shared-experts module.
            order: The execution point the caller is currently at.

        Returns:
            The freshly computed output (also cached for `output`), or
            None when `order` is not the selected point and nothing ran.
        """
        experts_order, use_shared_experts_stream = self._determine_shared_experts_order(
            shared_experts_input,
        )
        if order != experts_order:
            return None

        assert self._shared_experts is not None
        # A previous result must have been consumed through `output` first.
        assert self._output is None
        if order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream:
            # TODO: fold this in?
            self._setup_shared_experts_stream(shared_experts_input)

            # Run shared experts in parallel on a separate stream
            # NOTE: We start the separate stream here and mark the
            # sync end point immediately after it is done. This is
            # important to avoid excessive stream allocations by the cuda
            # graph replay later.
            with torch.cuda.stream(self._shared_experts_stream):
                # NOTE(review): an earlier comment here said a clone() of
                # the input was necessary to avoid conflicts with the main
                # stream. No clone is made in this code; the setup helper
                # asserts `disable_inplace` instead. Confirm that suffices.
                self._output = self._shared_experts(shared_experts_input)
            current_stream().wait_stream(self._shared_experts_stream)
        else:
            self._output = self._shared_experts(shared_experts_input)

        if order == SharedExpertsOrder.EXTERNAL:
            # TODO: figure out how to combine this with maybe_reduce_output?
            # or get rid of it completely.....
            assert self._output is not None
            self._output = self._maybe_reduce_shared_out(self._output)

        # TODO: do AFTER reduce here?

        return self._output  # ?
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index a29d8a7d8dda..e2a5d05320d0 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -93,7 +93,7 @@ def forward_native( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: return self.forward_cuda(layer, x, topk_weights, topk_ids, shared_experts_input) @property @@ -297,7 +297,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: return self.forward( layer=layer, x=x, @@ -322,7 +322,7 @@ def forward_cuda( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.kernel is not None return self.kernel.apply( @@ -343,7 +343,7 @@ def forward_monolithic_cuda( layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: F401 assert self.unquantized_backend == UnquantizedMoeBackend.FLASHINFER_TRTLLM @@ -369,7 +369,7 @@ def forward_monolithic_cpu( layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: return self.cpu_fused_moe( layer, x, diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 5b7af3193b03..da717556adbd 100644 --- 
a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -765,7 +765,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: return fused_marlin_moe( x, layer.w13_qweight, diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 716a20090f69..729924663646 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -483,7 +483,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts # TODO(bnell): Do these need to be called on the hot path? diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 29115fbbc255..8fa25efd1da0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -349,7 +349,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.moe_kernel is not None return self.moe_kernel.apply( x, @@ -597,7 +597,7 @@ def apply_monolithic( layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply_monolithic( @@ -622,7 +622,7 @@ def apply( topk_weights: 
torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.moe_kernel is not None return self.moe_kernel.apply( x, @@ -959,7 +959,7 @@ def apply_monolithic( layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.moe_kernel is not None return self.moe_kernel.apply_monolithic( x, @@ -983,7 +983,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert not self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply( @@ -1123,7 +1123,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts return fused_experts( @@ -1607,7 +1607,7 @@ def apply_monolithic( layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.kernel_backend == "Flashinfer" return flashinfer_trtllm_mxint4_moe( x=x, @@ -1634,7 +1634,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.kernel_backend == "Marlin" return fused_marlin_moe( x, @@ -1883,7 +1883,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts return fused_experts( @@ -2500,7 +2500,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, 
shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet." diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index d971f3b5b0d2..301441ff019d 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -141,7 +141,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts return fused_experts( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 5101347cd02a..f967e41a79b3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -961,7 +961,7 @@ def apply_monolithic( layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply_monolithic( @@ -986,7 +986,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert not self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply( diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 88023349e779..cf72c83d717a 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -637,7 +637,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, 
shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: if layer.apply_router_weight_on_input: raise NotImplementedError( "Apply router weight on input is not supported for" diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d7b2a366e1f0..0b43fbd392c9 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -901,7 +901,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: return fused_marlin_moe( x, layer.w13_qweight, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 78644f74d288..a5e1ea52f557 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -932,7 +932,7 @@ def apply_monolithic( layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply_monolithic( @@ -957,7 +957,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert not self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply( @@ -1416,7 +1416,7 @@ def apply_monolithic( layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply_monolithic( @@ -1441,7 +1441,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: 
torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert not self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply( diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index f5c679840432..a327ac17bbc9 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -369,7 +369,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts assert layer.activation == MoEActivation.SILU, ( diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index f992d0f86c4e..16536b85caea 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1051,7 +1051,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert not self.is_monolithic if layer.enable_eplb: raise NotImplementedError("EPLB is not supported for mxfp4") @@ -1096,7 +1096,7 @@ def apply_monolithic( layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.is_monolithic if layer.enable_eplb: diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 0a5db4e71fdb..fcf03d330d11 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -445,7 +445,7 @@ def apply( topk_weights: 
torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: if self.rocm_aiter_moe_enabled: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_fused_experts, @@ -634,7 +634,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_fused_experts, ) @@ -1042,7 +1042,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: if not self.emulate: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_fused_experts, From e8865e6c9a7e2e5dc8ccd1cda73f231d69a72528 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 23 Feb 2026 18:47:11 -0500 Subject: [PATCH 012/191] cleanups Signed-off-by: Bill Nell --- .../model_executor/layers/fused_moe/config.py | 10 ++ vllm/model_executor/layers/fused_moe/layer.py | 20 ++-- .../fused_moe/runner/default_moe_runner.py | 98 +++++++++++-------- .../layers/fused_moe/runner/shared_experts.py | 76 ++++++++------ .../model_executor/models/transformers/moe.py | 2 + 5 files changed, 121 insertions(+), 85 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 2500387debe1..cfe50a36cf8e 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -937,6 +937,16 @@ class FusedMoEParallelConfig: all2all_backend: str # all2all backend for MoE communication enable_eplb: bool # whether to enable expert load balancing + @property + def use_dp_chunking(self) -> bool: + return ( + or self.use_deepep_ll_kernels + or self.use_mori_kernels 
+ or self.use_fi_all2allv_kernels + or self.use_fi_nvl_two_sided_kernels + or self.use_nixl_ep_kernels + ) and envs.VLLM_ENABLE_MOE_DP_CHUNK + @property def is_sequence_parallel(self) -> bool: return self.sp_size > 1 diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index bbf6e2ffedb9..c8328ec9b63f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -642,19 +642,16 @@ def _init_shared_experts(self) -> SharedExperts | None: if self._shared_experts is None: return None - reduce_shared_output = ( - self.reduce_results - # XXXX ordering issue - and self.quant_method.moe_mk is not None - and self.quant_method.moe_mk.output_is_reduced() - ) - return SharedExperts( self._shared_experts, moe_config=self.moe_config, - has_separate_shared_experts=not self.quant_method.mk_owns_shared_expert, - use_dp_chunking=self.use_dp_chunking, # XXXXXXXXXXXXXX - must_reduce_shared_expert_outputs=reduce_shared_output, + # Note: For now we must pass quant_method along to SharedExperts so it + # can properly determine where the shared experts are supposed to be + # called, i.e. by a MK or by the MoERunner. + # Once the MK can be created upfront, we can just pass in the proper + # flags derived from the quant_method's MK. 
+ reduce_results=self.reduce_results, + quant_method=self.quant_method, ) def _init_runner(self) -> DefaultMoERunner: @@ -671,7 +668,6 @@ def _init_runner(self) -> DefaultMoERunner: quant_method=self.quant_method, reduce_results=self.reduce_results, enable_dbo=self.vllm_config.parallel_config.enable_dbo, - enable_eplb=self.enable_eplb, ) # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py @@ -710,7 +706,7 @@ def maybe_init_modular_kernel(self) -> None: self, self.base_quant_method, prepare_finalize, - self.runner._get_shared_experts(), # XXXXXXXXXXXXXXXXX + self.shared_experts, inplace=not self.moe_config.disable_inplace, ) ) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index cc177b946cf1..5822fa7bcf26 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -7,7 +7,6 @@ import torch import torch.nn.functional as F -import vllm.envs as envs from vllm.distributed import ( get_ep_group, get_pcp_group, @@ -187,7 +186,6 @@ def __init__( quant_method: FusedMoEMethodBase, reduce_results: bool, enable_dbo: bool, - enable_eplb: bool, ): super().__init__() self.moe_config = moe_config @@ -198,10 +196,10 @@ def __init__( self.quant_method = quant_method self.reduce_results = reduce_results self.enable_dbo = enable_dbo - self.enable_eplb = enable_eplb + self.enable_eplb = moe_config.moe_parallel_config.enable_eplb # Chunked all2all staging tensor - # TODO rename these + # TODO rename these? # These need to exist ahead of time due to CUDAgraph construction # needing a fixed buffer address. # TODO: these could be global, i.e. 
shared by all layers @@ -209,34 +207,31 @@ def __init__( self.batched_router_logits: torch.Tensor | None = None self._maybe_init_dp_chunking() - self.use_dp_chunking = ( - self.moe_config.moe_parallel_config.use_pplx_kernels - or self.moe_config.moe_parallel_config.use_deepep_ll_kernels - or self.moe_config.moe_parallel_config.use_mori_kernels - or self.moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels - or self.moe_config.moe_parallel_config.use_nixl_ep_kernels - ) and envs.VLLM_ENABLE_MOE_DP_CHUNK + self.use_dp_chunking = self.moe_config.moe_parallel_config.use_dp_chunking # Needed for string -> FusedMoE layer lookup in custom ops. self.layer_name = layer.layer_name - self.moe_forward = self._select_forward(layer) + self.moe_forward, self.moe_forward_impl = self._select_forward(layer) - # XXXXXX used by layer.py and lora/layers/fused_moe.py - def _get_shared_experts(self) -> SharedExperts | None: - return self.shared_experts - - def _select_forward(self, layer: torch.nn.Module) -> Callable: + def _select_forward(self, layer: torch.nn.Module) -> tuple[Callable, Callable]: + forward_impl_fn = ( + self.forward_impl_chunked if self.use_dp_chunking else self.forward_impl + ) if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we # will switch to using the moe_forward custom op. # Note: CPU doesn't require wrapped forward_impl. 
- return _moe_forward if self.shared_experts is None else _moe_forward_shared + return ( + _moe_forward if self.shared_experts is None else _moe_forward_shared, + forward_impl_fn, + ) return ( torch.ops.vllm.moe_forward if self.shared_experts is None - else torch.ops.vllm.moe_forward_shared + else torch.ops.vllm.moe_forward_shared, + forward_impl_fn, ) def _maybe_init_dp_chunking(self): @@ -308,6 +303,9 @@ def apply_routed_input_transform( TODO: For latent MoE bandwidth optimization, fc2_latent_proj could be moved inside SharedFusedMoE to all-reduce on the smaller latent dimension. + + Returns (possibly transformed) hidden states and the input for shared + experts (or None if there are no shared experts). """ if self.routed_input_transform is not None: result = self.routed_input_transform(hidden_states) @@ -320,7 +318,7 @@ def apply_routed_input_transform( return ( hidden_states, hidden_states if self.shared_experts is not None else None, - ) # XXXXX + ) def _maybe_reduce_output( self, @@ -364,10 +362,12 @@ def _encode_layer_name(self) -> str | ModuleName: def _maybe_pad_hidden_states( self, - shared_experts_input: torch.Tensor, + shared_experts_input: torch.Tensor | None, hidden_states: torch.Tensor, ) -> tuple[torch.Tensor, list[int]]: - shared_experts_hidden_dim = shared_experts_input.shape[-1] + shared_experts_hidden_dim = ( + shared_experts_input.shape[-1] if shared_experts_input is not None else 0 + ) transformed_hidden_dim = hidden_states.shape[-1] if ( not self.quant_method.skip_forward_padding @@ -551,17 +551,39 @@ def _maybe_combine( else: return hidden_states - # - # forward - # - self.moe_forward (_moe_forward or _moe_forward_shared) - # - forward_dispatch - # - forward_impl or forward_impl_chunked - # def forward( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """Invoke the fused moe layer. + + Input: + - hidden_states + - router_logits + + Output: + - The new hidden_states. 
+ or + - A tuple of (shared experts output, new hidden_states). + + Calling sequence + - forward + - self.moe_forward (_moe_forward or _moe_forward_shared custom op) + - forward_dispatch + - moe_forward_impl (forward_impl or forward_impl_chunked) + + Note: The existence of _moe_forward and _moe_forward_shared custom ops are due + to the following reasons: + 1. the chunking loop in forward_impl_chunked cannot be compiled by torch.compile + 2. pytorch cannot handle union types in custom op signatures so _moe_forward and + _moe_forward_shared must be split. + + If forward_impl_chunked can be implemented via torch.scan we can potentially get + rid of _moe_forward and _moe_forward_shared and collapse the whole sequence into + the 'forward' method. + """ + # Apply transform for routed experts (e.g., latent projection for latent MoE) hidden_states, shared_experts_input = self.apply_routed_input_transform( hidden_states @@ -599,20 +621,12 @@ def forward_dispatch( ) with self._sequence_parallel_context(): - if self.use_dp_chunking: - return self.forward_impl_chunked( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) - else: - return self.forward_impl( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) + return self.moe_forward_impl( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) def _slice_and_copy_input( self, diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index ad1bac4dbb03..5081311e57df 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -13,6 +13,9 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, ) +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, +) from vllm.platforms import current_platform from vllm.utils.torch_utils import ( aux_stream, @@ -41,31 
+44,25 @@ class SharedExpertsOrder(IntEnum): AFTER_QUANT_METHOD = (4,) -# XXXXX add method to prime with shared_experts_input? NO class SharedExperts: def __init__( self, shared_experts: torch.nn.Module, moe_config: FusedMoEConfig, - has_separate_shared_experts: bool, # better name - use_dp_chunking: bool, - must_reduce_shared_expert_outputs: bool, # and reduce_results + quant_method: FusedMoEMethodBase, + reduce_results: bool, ): self._output: torch.Tensor | None = None self._shared_experts = shared_experts - self._moe_config = moe_config # invariant - self._use_dp_chunking = use_dp_chunking # invariant - - self._has_separate_shared_experts = has_separate_shared_experts # depends on MK - self._must_reduce_shared_expert_outputs = ( - must_reduce_shared_expert_outputs # depends on MK - ) + self._moe_config = moe_config + self._quant_method = quant_method + self._reduce_results = reduce_results + self._use_dp_chunking = moe_config.moe_parallel_config.use_dp_chunking # Allow disabling of the separate shared experts stream for # debug purposes. 
# TODO: Remove this after more extensive testings with TP/DP # and other execution modes - self.use_shared_experts_stream = False if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") self._shared_experts_stream = None @@ -78,6 +75,7 @@ def __init__( "Enabled separate cuda stream for MoE shared_experts", scope="local" ) + @property def _has_external_experts(self) -> bool: # Disable shared expert overlap if: # - we are using eplb with non-default backend, because of correctness issues @@ -91,6 +89,22 @@ def _has_external_experts(self) -> bool: or self._moe_config.moe_parallel_config.use_fi_all2allv_kernels ) + # TODO(bnell): better name + @property + def _has_separate_shared_experts(self) -> bool: + return ( + not self._quant_method.mk_owns_shared_expert + and self._shared_experts is not None + ) + + @property + def _must_reduce_shared_expert_outputs(self) -> bool: + return ( + self._reduce_results + and self._quant_method.moe_mk is not None + and self._quant_method.moe_mk.output_is_reduced() + ) + def _determine_shared_experts_order( self, hidden_states: torch.Tensor, @@ -98,7 +112,7 @@ def _determine_shared_experts_order( if self._shared_experts is None: return SharedExpertsOrder.NONE, False - if self._has_external_experts(): + if self._has_external_experts: return SharedExpertsOrder.EXTERNAL, False if ( @@ -127,10 +141,10 @@ def _determine_shared_experts_order( else: return SharedExpertsOrder.AFTER_QUANT_METHOD, allow_shared_experts_stream - def _setup_shared_experts_stream( + def _call_with_shared_experts_stream( self, shared_experts_input: torch.Tensor, - ): + ) -> torch.Tensor: assert self._shared_experts_stream is not None assert self._moe_config.disable_inplace @@ -144,9 +158,21 @@ def _setup_shared_experts_stream( # Mark sync start point for the separate shared experts # stream here since we want to run in parallel with the # router/gate (next op below) - assert self._shared_experts_stream 
is not None self._shared_experts_stream.wait_stream(current_stream()) + # Run shared experts in parallel on a separate stream + # NOTE: We start the separate stream here and mark the + # sync end point immediately after it is done. This is + # important to avoid excessive stream allocations by the cuda + # graph replay later. + with torch.cuda.stream(self._shared_experts_stream): + # Note that hidden_states clone() is necessary here to avoid + # conflict with the main stream + output = self._shared_experts(shared_experts_input) + current_stream().wait_stream(self._shared_experts_stream) + + return output + def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor: # Reduce shared expert outputs if necessary, since the MLP # should have been created with reduce_results=False. @@ -164,8 +190,6 @@ def output(self) -> torch.Tensor | None: self._output = None return output - # add split/join? - def apply( self, shared_experts_input: torch.Tensor, @@ -174,25 +198,15 @@ def apply( experts_order, use_shared_experts_stream = self._determine_shared_experts_order( shared_experts_input, ) + if order != experts_order: return None assert self._shared_experts is not None assert self._output is None + if order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream: - # TODO: fold this in? - self._setup_shared_experts_stream(shared_experts_input) - - # Run shared experts in parallel on a separate stream - # NOTE: We start the separate stream here and mark the - # sync end point immediately after it is done. This is - # important to avoid excessive stream allocations by the cuda - # graph replay later. 
- with torch.cuda.stream(self._shared_experts_stream): - # Note that hidden_states clone() is necessary here to avoid - # conflict with the main stream - self._output = self._shared_experts(shared_experts_input) - current_stream().wait_stream(self._shared_experts_stream) + self._output = self._call_with_shared_experts_stream(shared_experts_input) else: self._output = self._shared_experts(shared_experts_input) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 5f8352faed50..f65a197abcfc 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -94,6 +94,8 @@ def transformers_moe_forward( self = forward_context.no_compile_layers[layer_name] self._topk_ids = topk_ids # Clone hidden_states because it will be mutated in-place in FusedMoE + # TODO(bnell): figure out a way to avoid calling runner directly. + # it is a hack that the weight are being passed via logits. return self.runner.forward(hidden_states.clone(), topk_weights) From f83e0f5d13963d2a1a61dfd9b692420eae416b04 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 23 Feb 2026 18:54:43 -0500 Subject: [PATCH 013/191] fix circular import Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/shared_experts.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 5081311e57df..305697adf699 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -13,8 +13,8 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, ) -from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( - FusedMoEMethodBase, +from vllm.model_executor.layers.quantization.base_config import ( + QuantizeMethodBase, ) from vllm.platforms import 
current_platform from vllm.utils.torch_utils import ( @@ -49,9 +49,17 @@ def __init__( self, shared_experts: torch.nn.Module, moe_config: FusedMoEConfig, - quant_method: FusedMoEMethodBase, + quant_method: QuantizeMethodBase, reduce_results: bool, ): + from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, + ) + + # quant_method must be a FusedMoEMethodBase but we can't use the type + # due to circular imports. + assert isinstance(quant_method, FusedMoEMethodBase) + self._output: torch.Tensor | None = None self._shared_experts = shared_experts self._moe_config = moe_config From 88e80b915d2a9f64bd1792a44ee0239c555c0771 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Feb 2026 16:32:20 -0500 Subject: [PATCH 014/191] fixes Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- .../layers/fused_moe/runner/default_moe_runner.py | 3 +-- vllm/model_executor/layers/fused_moe/runner/shared_experts.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c8328ec9b63f..67d8af288abf 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -741,7 +741,7 @@ def use_ep(self): @property def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass - return self.gate is not None + return self._gate is not None def _maybe_init_expert_routing_tables( self, diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 5822fa7bcf26..ac9086444dcf 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -203,12 +203,11 @@ def __init__( # These need to exist ahead of time due to CUDAgraph construction # needing a fixed 
buffer address. # TODO: these could be global, i.e. shared by all layers + self.use_dp_chunking = self.moe_config.moe_parallel_config.use_dp_chunking self.batched_hidden_states: torch.Tensor | None = None self.batched_router_logits: torch.Tensor | None = None self._maybe_init_dp_chunking() - self.use_dp_chunking = self.moe_config.moe_parallel_config.use_dp_chunking - # Needed for string -> FusedMoE layer lookup in custom ops. self.layer_name = layer.layer_name diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 305697adf699..76a150bd1241 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -120,7 +120,7 @@ def _determine_shared_experts_order( if self._shared_experts is None: return SharedExpertsOrder.NONE, False - if self._has_external_experts: + if self._has_external_experts and not self._use_dp_chunking: return SharedExpertsOrder.EXTERNAL, False if ( @@ -193,7 +193,7 @@ def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor: @property def output(self) -> torch.Tensor | None: - assert self._shared_experts is None == self._output is None + assert (self._shared_experts is None) == (self._output is None) output = self._output self._output = None return output From 781d4eaf70af1f7e9a0cf9cdbf56cfad4e6bb9c8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Feb 2026 17:04:50 -0500 Subject: [PATCH 015/191] renames Signed-off-by: Bill Nell --- tools/ep_kernels/install_python_libraries.sh | 1 + .../fused_moe/runner/default_moe_runner.py | 33 ++++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index 3372dd10f4dc..c3deb7d6060c 100755 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -103,6 
+103,7 @@ pushd "$WORKSPACE" echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..." curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}" tar -xf "${NVSHMEM_FILE}" +rm -rf nvshmem mv "${NVSHMEM_FILE%.tar.xz}" nvshmem rm -f "${NVSHMEM_FILE}" rm -rf nvshmem/lib/bin nvshmem/lib/share diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index ac9086444dcf..2a0117e9c933 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -211,12 +211,14 @@ def __init__( # Needed for string -> FusedMoE layer lookup in custom ops. self.layer_name = layer.layer_name - self.moe_forward, self.moe_forward_impl = self._select_forward(layer) + self.forward_entry, self.forward_impl = self._select_forward(layer) def _select_forward(self, layer: torch.nn.Module) -> tuple[Callable, Callable]: + # Select implementation based on presence of DP chunking. forward_impl_fn = ( - self.forward_impl_chunked if self.use_dp_chunking else self.forward_impl + self._forward_impl_chunked if self.use_dp_chunking else self._forward_impl ) + if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we # will switch to using the moe_forward custom op. @@ -568,19 +570,20 @@ def forward( Calling sequence - forward - - self.moe_forward (_moe_forward or _moe_forward_shared custom op) + - self.forward_entry (_moe_forward or _moe_forward_shared custom op) - forward_dispatch - - moe_forward_impl (forward_impl or forward_impl_chunked) + - forward_impl (_forward_impl or _forward_impl_chunked) Note: The existence of _moe_forward and _moe_forward_shared custom ops are due to the following reasons: - 1. the chunking loop in forward_impl_chunked cannot be compiled by torch.compile - 2. 
pytorch cannot handle union types in custom op signatures so _moe_forward and - _moe_forward_shared must be split. - - If forward_impl_chunked can be implemented via torch.scan we can potentially get - rid of _moe_forward and _moe_forward_shared and collapse the whole sequence into - the 'forward' method. + 1. the chunking loop in _forward_impl_chunked cannot be compiled by + torch.compile + 2. pytorch cannot handle union types in custom op signatures so _moe_forward + and _moe_forward_shared must be split. + + If _forward_impl_chunked can be implemented via torch.scan we can potentially + get rid of _moe_forward and _moe_forward_shared and collapse the whole sequence + into the 'forward' method. """ # Apply transform for routed experts (e.g., latent projection for latent MoE) @@ -593,7 +596,7 @@ def forward( hidden_states, ) - fused_output = self.moe_forward( + fused_output = self.forward_entry( hidden_states, router_logits, shared_experts_input, @@ -620,7 +623,7 @@ def forward_dispatch( ) with self._sequence_parallel_context(): - return self.moe_forward_impl( + return self.forward_impl( layer, hidden_states, router_logits, @@ -647,7 +650,7 @@ def _slice_and_copy_input( out_slice.copy_(orig_slice, non_blocking=True) return out_slice - def forward_impl_chunked( + def _forward_impl_chunked( self, layer: torch.nn.Module, hidden_states: torch.Tensor, @@ -735,7 +738,7 @@ def forward_impl_chunked( assert final_shared_hidden_states is not None return (final_shared_hidden_states, final_fused_hidden_states) - def forward_impl( + def _forward_impl( self, layer: torch.nn.Module, hidden_states: torch.Tensor, From 369501657ac58a748f1947690b278c0cfc2a455d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Feb 2026 17:23:49 -0500 Subject: [PATCH 016/191] add comment Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/default_moe_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py 
b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 2a0117e9c933..64fb330d0487 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -77,6 +77,9 @@ def _resolve_layer_name(layer_name: str | ModuleName) -> str: return layer_name.value if isinstance(layer_name, ModuleName) else layer_name +# Note: _moe_forward and _moe_forward_shared should not contain any +# implementation details, They should merely pass along control to +# the runner's 'forward_dispatch' method. def _moe_forward( hidden_states: torch.Tensor, router_logits: torch.Tensor, From 053f66f7cfa0cd59bada3dae156c341420355e07 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Feb 2026 17:26:55 -0500 Subject: [PATCH 017/191] more renames Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/shared_experts.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 76a150bd1241..45538daff5f8 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -97,9 +97,8 @@ def _has_external_experts(self) -> bool: or self._moe_config.moe_parallel_config.use_fi_all2allv_kernels ) - # TODO(bnell): better name @property - def _has_separate_shared_experts(self) -> bool: + def _has_mk_owned_shared_experts(self) -> bool: return ( not self._quant_method.mk_owns_shared_expert and self._shared_experts is not None @@ -124,14 +123,14 @@ def _determine_shared_experts_order( return SharedExpertsOrder.EXTERNAL, False if ( - not self._has_separate_shared_experts + not self._has_mk_owned_shared_experts or not self._moe_config.moe_parallel_config.use_all2all_kernels ): return SharedExpertsOrder.INTERNAL, False allow_shared_experts_stream = ( current_platform.is_cuda() 
- and self._has_separate_shared_experts + and self._has_mk_owned_shared_experts and not self._use_dp_chunking and self._shared_experts_stream is not None and hidden_states.shape[0] @@ -141,7 +140,7 @@ def _determine_shared_experts_order( # Check if we need to run shared experts before matrix multiply because # matrix multiply may modify the hidden_states. run_shared_experts_before = ( - self._has_separate_shared_experts and not allow_shared_experts_stream + self._has_mk_owned_shared_experts and not allow_shared_experts_stream ) if run_shared_experts_before: @@ -202,7 +201,7 @@ def apply( self, shared_experts_input: torch.Tensor, order: SharedExpertsOrder, - ) -> torch.Tensor | None: + ): experts_order, use_shared_experts_stream = self._determine_shared_experts_order( shared_experts_input, ) @@ -220,10 +219,8 @@ def apply( if order == SharedExpertsOrder.EXTERNAL: # TODO: figure out how to combine this with maybe_reduce_output? - # or get rid of it completely..... + # or get rid of it completely. assert self._output is not None self._output = self._maybe_reduce_shared_out(self._output) - # TODO: do AFTER reduce here? - - return self._output # ? + # TODO(bnell): potentially do AFTER reduce here insteed of in runner. From 708dd2bbe3089b8c08d6fd6cd0c5c748b06faa05 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Feb 2026 12:55:16 -0500 Subject: [PATCH 018/191] cleanup Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index b6c62b618a09..11332f4cb228 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -172,7 +172,6 @@ def wrapper(*args, **kwargs): moe_state_dict["apply_router_weight_on_input"] = kwargs[ "apply_router_weight_on_input" ] - # TODO: global_num_experts/shared_experts_input? 
result = func(*args, **kwargs) return result @@ -597,10 +596,6 @@ def forward(self, *args, **kwargs): def maybe_all_reduce_tensor_model_parallel(self, *args, **kwargs): return self.base_layer.maybe_all_reduce_tensor_model_parallel(*args, **kwargs) - # @property - # def _shared_experts(self): - # return self.base_layer._shared_experts - @property def quant_method(self): return self.base_layer.quant_method From 5748f7c8391e422a7afd12ecac1dbdd0ca444e32 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Feb 2026 19:38:08 -0500 Subject: [PATCH 019/191] remove memoizing router, not needed yet Signed-off-by: Bill Nell --- .../fused_moe/router/memoizing_router.py | 35 ------------------- 1 file changed, 35 deletions(-) delete mode 100644 vllm/model_executor/layers/fused_moe/router/memoizing_router.py diff --git a/vllm/model_executor/layers/fused_moe/router/memoizing_router.py b/vllm/model_executor/layers/fused_moe/router/memoizing_router.py deleted file mode 100644 index a55bd2f09d6f..000000000000 --- a/vllm/model_executor/layers/fused_moe/router/memoizing_router.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable - -import torch - -from vllm.model_executor.layers.fused_moe.config import ( - RoutingMethodType, -) -from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter - - -class MemoizingRouter(FusedMoERouter): - def __init__(self, router: FusedMoERouter): - self.router = router - - def set_capture_fn( - self, - capture_fn: Callable[[torch.Tensor], None] | None, - ) -> None: - self.router.set_capture_fn(capture_fn) - self.results: tuple[torch.Tensor, torch.Tensor] | None = None - - @property - def routing_method_type(self) -> RoutingMethodType: - return self.router.routing_method_type - - def select_experts( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, 
torch.Tensor]: - if self.results is None: - self.results = self.router.select_experts(hidden_states, router_logits) - return self.results From 9123f156bdf00f82946cb5a82406f30030b525d2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 27 Feb 2026 18:33:43 -0500 Subject: [PATCH 020/191] fix UBD bug Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 67d8af288abf..772be9e150bb 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -567,7 +567,7 @@ def __init__( device=vllm_config.device_config.device, routing_method=self.routing_method_type, # TODO: in_dtype == out_dtype? - disable_inplace=disable_inplace() or self.shared_experts is not None, + disable_inplace=disable_inplace() or shared_experts is not None, ) if self.moe_config.use_mori_kernels: assert self.rocm_aiter_fmoe_enabled, ( From 04b430f9986072a02ff1106e07fe68c60d7fdd72 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 5 Mar 2026 16:35:53 -0500 Subject: [PATCH 021/191] cleanup merge Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/default_moe_runner.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 64fb330d0487..c48744174d76 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -37,8 +37,6 @@ from vllm.utils.torch_utils import ( HAS_OPAQUE_TYPE, ModuleName, - aux_stream, - current_stream, direct_register_custom_op, ) from vllm.v1.worker.ubatching import dbo_current_ubatch_id @@ -86,7 +84,7 @@ def _moe_forward( shared_experts_input: torch.Tensor | None, layer_name: _layer_name_type, ) -> 
torch.Tensor: - layer = get_layer_from_name(layer_name) + layer = get_layer_from_name(_resolve_layer_name(layer_name)) return layer.runner.forward_dispatch( layer, hidden_states, @@ -110,7 +108,7 @@ def _moe_forward_shared( shared_experts_input: torch.Tensor | None, layer_name: _layer_name_type, ) -> tuple[torch.Tensor, torch.Tensor]: - layer = get_layer_from_name(layer_name) + layer = get_layer_from_name(_resolve_layer_name(layer_name)) return layer.runner.forward_dispatch( layer, hidden_states, @@ -141,7 +139,7 @@ def _moe_forward_shared_fake( direct_register_custom_op( op_name="moe_forward", op_func=_moe_forward, - mutates_args=["hidden_states"], # ? + mutates_args=["hidden_states"], # is this still true? fake_impl=_moe_forward_fake, tags=(torch.Tag.needs_fixed_stride_order,), ) @@ -150,7 +148,6 @@ def _moe_forward_shared_fake( direct_register_custom_op( op_name="moe_forward_shared", op_func=_moe_forward_shared, - mutates_args=["hidden_states"], # ? fake_impl=_moe_forward_shared_fake, tags=(torch.Tag.needs_fixed_stride_order,), ) @@ -202,10 +199,8 @@ def __init__( self.enable_eplb = moe_config.moe_parallel_config.enable_eplb # Chunked all2all staging tensor - # TODO rename these? # These need to exist ahead of time due to CUDAgraph construction # needing a fixed buffer address. - # TODO: these could be global, i.e. 
shared by all layers self.use_dp_chunking = self.moe_config.moe_parallel_config.use_dp_chunking self.batched_hidden_states: torch.Tensor | None = None self.batched_router_logits: torch.Tensor | None = None From 526db3885a5c9bc4433b36e713c1a8d2ab1abf57 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 5 Mar 2026 16:46:20 -0500 Subject: [PATCH 022/191] fix merge Signed-off-by: Bill Nell --- tests/kernels/moe/utils.py | 2 -- vllm/model_executor/layers/fused_moe/config.py | 2 +- vllm/model_executor/layers/fused_moe/modular_kernel.py | 8 ++------ 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 4b693d8c8a55..2ef4424c2baa 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -603,7 +603,6 @@ def make_shared_experts( def modular_triton_fused_moe( moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig, - shared_experts: torch.nn.Module | None = None, ) -> FusedMoEKernel: return FusedMoEKernel( maybe_make_prepare_finalize( @@ -613,6 +612,5 @@ def modular_triton_fused_moe( use_monolithic=False, ), TritonExperts(moe_config, quant_config), - shared_experts, inplace=False, ) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index cfe50a36cf8e..b3f57e3e376a 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -940,7 +940,7 @@ class FusedMoEParallelConfig: @property def use_dp_chunking(self) -> bool: return ( - or self.use_deepep_ll_kernels + self.use_deepep_ll_kernels or self.use_mori_kernels or self.use_fi_all2allv_kernels or self.use_fi_nvl_two_sided_kernels diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index bbafc682c102..e424d36536b7 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1001,11 
+1001,9 @@ def __init__( self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts self.shared_experts = shared_experts - self.moe_parallel_config = moe_parallel_config self.inplace = inplace - moe_parallel_config = fused_experts.moe_config.moe_parallel_config - self.moe_parallel_config: FusedMoEParallelConfig | None = moe_parallel_config + self.moe_parallel_config = moe_parallel_config self.is_dp_ep = ( moe_parallel_config is not None and moe_parallel_config.dp_size > 1 @@ -1459,8 +1457,7 @@ def __init__( self, prepare_finalize: FusedMoEPrepareAndFinalize, fused_experts: FusedMoEExperts, - shared_experts: torch.nn.Module | None = None, - moe_parallel_config: FusedMoEParallelConfig | None = None, + shared_experts: SharedExperts | None = None, inplace: bool = False, ): super().__init__() @@ -1475,7 +1472,6 @@ def __init__( prepare_finalize, fused_experts, shared_experts, - moe_parallel_config, inplace, ) From 67bdab221527c4d1fd4eb9ac70469d9d0cb39992 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 5 Mar 2026 17:02:59 -0500 Subject: [PATCH 023/191] fix merge Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/runner/shared_experts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 45538daff5f8..d450f21bda34 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -108,8 +108,8 @@ def _has_mk_owned_shared_experts(self) -> bool: def _must_reduce_shared_expert_outputs(self) -> bool: return ( self._reduce_results - and self._quant_method.moe_mk is not None - and self._quant_method.moe_mk.output_is_reduced() + and self._quant_method.moe_kernel is not None + and self._quant_method.moe_kernel.output_is_reduced() ) def _determine_shared_experts_order( From e9afbe67b72707d32e5ad52b0f819d8fedf116e2 Mon 
Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 5 Mar 2026 17:15:45 -0500 Subject: [PATCH 024/191] fix typos Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- vllm/model_executor/layers/fused_moe/runner/shared_experts.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 772be9e150bb..02831d7d3f88 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -649,7 +649,7 @@ def _init_shared_experts(self) -> SharedExperts | None: # can property determine where the shared experts are supposed to be # called, i.e. by a MK or by the MoERunner. # Once the MK can be created upfront, we can just pass in the proper - # flags dervied from the quant_method's MK. + # flags derived from the quant_method's MK. reduce_results=self.reduce_results, quant_method=self.quant_method, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index d450f21bda34..134c500ebeea 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -223,4 +223,4 @@ def apply( assert self._output is not None self._output = self._maybe_reduce_shared_out(self._output) - # TODO(bnell): potentially do AFTER reduce here insteed of in runner. + # TODO(bnell): potentially do AFTER reduce here instead of in runner. 
From 453ab3d4ad5c000719acc35b2bee3c57810529fa Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 17:24:07 +0000 Subject: [PATCH 025/191] fix merge Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/config.py | 1 - vllm/model_executor/layers/fused_moe/runner/shared_experts.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index b3f57e3e376a..83df5e45207b 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -942,7 +942,6 @@ def use_dp_chunking(self) -> bool: return ( self.use_deepep_ll_kernels or self.use_mori_kernels - or self.use_fi_all2allv_kernels or self.use_fi_nvl_two_sided_kernels or self.use_nixl_ep_kernels ) and envs.VLLM_ENABLE_MOE_DP_CHUNK diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 134c500ebeea..05258999ec8c 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -94,7 +94,7 @@ def _has_external_experts(self) -> bool: self._moe_config.moe_parallel_config.enable_eplb and backend != "allgather_reducescatter" ) - or self._moe_config.moe_parallel_config.use_fi_all2allv_kernels + or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels ) @property From 48acc59f18b12d4d2bffa4a60cbe3d2beecd0d17 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 18:44:53 +0000 Subject: [PATCH 026/191] fix format Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/oracle/fp8.py | 5 +---- vllm/model_executor/layers/fused_moe/oracle/nvfp4.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index 57431b946e24..59432c79d853 100644 
--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -584,10 +584,7 @@ def make_fp8_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=( - shared_experts - if prepare_finalize.supports_async() else None - ), + shared_experts=shared_experts if prepare_finalize.supports_async() else None, inplace=( not moe_config.disable_inplace and fp8_backend != Fp8MoeBackend.FLASHINFER_CUTLASS diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index 194513981e41..89cacc8ca01e 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ -436,10 +436,7 @@ def make_nvfp4_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=( - shared_experts - if prepare_finalize.supports_async() else None - ), + shared_experts=shared_experts if prepare_finalize.supports_async() else None, inplace=False, ) From c0678440b0c6923cd205271ec9048fbe57caf007 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 19 Mar 2026 19:11:20 +0000 Subject: [PATCH 027/191] fix gate overlap Signed-off-by: Bill Nell --- .../fused_moe/runner/default_moe_runner.py | 17 +++++--- .../layers/fused_moe/runner/shared_experts.py | 42 ++++++++++++------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index c48744174d76..0f56da79a053 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -476,17 +476,22 @@ def _allocate_dp_chunking_outputs( return final_shared_hidden_states, final_fused_hidden_states - def _maybe_gate( + def _maybe_overlap_gate_with_shared_experts( self, hidden_states: torch.Tensor, router_logits: 
torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: # If router/gate provided, then apply it here. # (Note: This code runs only when "overlapped mode" is on to allow # parallel execution of shared experts with the FusedMoE via # separate cuda stream) + if self.shared_experts is not None: + self.shared_experts.maybe_setup_shared_experts_stream(shared_experts_input) + if self.gate is not None: router_logits, _ = self.gate(hidden_states) + return router_logits @property @@ -613,7 +618,11 @@ def forward_dispatch( # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() - router_logits = self._maybe_gate(hidden_states, router_logits) + router_logits = self._maybe_overlap_gate_with_shared_experts( + hidden_states, + router_logits, + shared_experts_input, + ) self._maybe_apply_shared_experts( shared_experts_input, @@ -655,10 +664,6 @@ def _forward_impl_chunked( router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # Gate overlap not supported when chunking is enabled. Run the - # gate first. 
- router_logits = self._maybe_gate(hidden_states, router_logits) - final_shared_hidden_states, final_fused_hidden_states = ( self._allocate_dp_chunking_outputs(hidden_states, router_logits) ) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 05258999ec8c..3aea8201e509 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -148,24 +148,38 @@ def _determine_shared_experts_order( else: return SharedExpertsOrder.AFTER_QUANT_METHOD, allow_shared_experts_stream + def maybe_setup_shared_experts_stream( + self, + shared_experts_input: torch.Tensor, + ): + experts_order, use_shared_experts_stream = self._determine_shared_experts_order( + shared_experts_input, + ) + + if ( + experts_order == SharedExpertsOrder.AFTER_QUANT_METHOD + and use_shared_experts_stream + ): + assert self._shared_experts_stream is not None + assert self._moe_config.disable_inplace + + # Record that the clone will be used by shared_experts_stream + # to avoid gc issue from deallocation of hidden_states_clone + # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 + # NOTE: We don't need shared_output.record_stream(current_stream()) + # because we synch the streams before using shared_output. 
+ shared_experts_input.record_stream(self._shared_experts_stream) + + # Mark sync start point for the separate shared experts + # stream here since we want to run in parallel with the + # router/gate (next op below) + self._shared_experts_stream.wait_stream(current_stream()) + def _call_with_shared_experts_stream( self, shared_experts_input: torch.Tensor, ) -> torch.Tensor: - assert self._shared_experts_stream is not None - assert self._moe_config.disable_inplace - - # Record that the clone will be used by shared_experts_stream - # to avoid gc issue from deallocation of hidden_states_clone - # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 - # NOTE: We don't need shared_output.record_stream(current_stream()) - # because we synch the streams before using shared_output. - shared_experts_input.record_stream(self._shared_experts_stream) - - # Mark sync start point for the separate shared experts - # stream here since we want to run in parallel with the - # router/gate (next op below) - self._shared_experts_stream.wait_stream(current_stream()) + # TODO: assert that maybe_setup_shared_experts_stream has been called. 
# Run shared experts in parallel on a separate stream # NOTE: We start the separate stream here and mark the From bc8297856810510b9419f2a5360437da34b64330 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 19 Mar 2026 20:47:24 +0000 Subject: [PATCH 028/191] renames, revert lora changes Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 4 -- .../layers/fused_moe/runner/shared_experts.py | 39 +++++++++---------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 11332f4cb228..c4950cd75d07 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -150,11 +150,8 @@ def _inject_lora_into_fused_moe(self): self.base_layer.quant_method.select_gemm_impl( prepare_finalize, self.base_layer ), - self.base_layer.shared_experts, ) - # TODO: could be incorrect due to monolithic kernel? or add assert it - # is modular? if quant_config.use_mxfp4_w4a16: assert isinstance( m_fused_moe_fn.impl.fused_experts, @@ -341,7 +338,6 @@ def wrapper(*args, **kwargs): fused_experts = m_fused_moe_fn.impl.fused_experts - # TODO: seems like this could be done with modular kernel subclasses? 
m_fused_moe_fn.apply = fwd_decorator(self.base_layer, m_fused_moe_fn.apply) fused_experts.activation = act_decorator( self.base_layer, fused_experts.activation diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 3aea8201e509..b8c70ac24fae 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -47,7 +47,7 @@ class SharedExpertsOrder(IntEnum): class SharedExperts: def __init__( self, - shared_experts: torch.nn.Module, + layer: torch.nn.Module, moe_config: FusedMoEConfig, quant_method: QuantizeMethodBase, reduce_results: bool, @@ -61,7 +61,7 @@ def __init__( assert isinstance(quant_method, FusedMoEMethodBase) self._output: torch.Tensor | None = None - self._shared_experts = shared_experts + self._layer = layer self._moe_config = moe_config self._quant_method = quant_method self._reduce_results = reduce_results @@ -73,12 +73,12 @@ def __init__( # and other execution modes if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") - self._shared_experts_stream = None + self._stream = None else: # TODO(rob): enable shared expert overlap with non-cuda-alike. # aux_stream() returns None on non-cuda-alike platforms. 
- self._shared_experts_stream = aux_stream() - if self._shared_experts_stream is not None: + self._stream = aux_stream() + if self._stream is not None: logger.debug_once( "Enabled separate cuda stream for MoE shared_experts", scope="local" ) @@ -89,7 +89,7 @@ def _has_external_experts(self) -> bool: # - we are using eplb with non-default backend, because of correctness issues # - we are using flashinfer with DP, since there nothing to gain backend = self._moe_config.moe_parallel_config.all2all_backend - return self._shared_experts is not None and not ( + return self._layer is not None and not ( ( self._moe_config.moe_parallel_config.enable_eplb and backend != "allgather_reducescatter" @@ -99,10 +99,7 @@ def _has_external_experts(self) -> bool: @property def _has_mk_owned_shared_experts(self) -> bool: - return ( - not self._quant_method.mk_owns_shared_expert - and self._shared_experts is not None - ) + return not self._quant_method.mk_owns_shared_expert and self._layer is not None @property def _must_reduce_shared_expert_outputs(self) -> bool: @@ -116,7 +113,7 @@ def _determine_shared_experts_order( self, hidden_states: torch.Tensor, ) -> tuple[SharedExpertsOrder, bool]: - if self._shared_experts is None: + if self._layer is None: return SharedExpertsOrder.NONE, False if self._has_external_experts and not self._use_dp_chunking: @@ -132,7 +129,7 @@ def _determine_shared_experts_order( current_platform.is_cuda() and self._has_mk_owned_shared_experts and not self._use_dp_chunking - and self._shared_experts_stream is not None + and self._stream is not None and hidden_states.shape[0] <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD ) @@ -160,7 +157,7 @@ def maybe_setup_shared_experts_stream( experts_order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream ): - assert self._shared_experts_stream is not None + assert self._stream is not None assert self._moe_config.disable_inplace # Record that the clone will be used by shared_experts_stream @@ 
-168,12 +165,12 @@ def maybe_setup_shared_experts_stream( # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 # NOTE: We don't need shared_output.record_stream(current_stream()) # because we synch the streams before using shared_output. - shared_experts_input.record_stream(self._shared_experts_stream) + shared_experts_input.record_stream(self._stream) # Mark sync start point for the separate shared experts # stream here since we want to run in parallel with the # router/gate (next op below) - self._shared_experts_stream.wait_stream(current_stream()) + self._stream.wait_stream(current_stream()) def _call_with_shared_experts_stream( self, @@ -186,11 +183,11 @@ def _call_with_shared_experts_stream( # sync end point immediately after it is done. This is # important to avoid excessive stream allocations by the cuda # graph replay later. - with torch.cuda.stream(self._shared_experts_stream): + with torch.cuda.stream(self._stream): # Note that hidden_states clone() is necessary here to avoid # conflict with the main stream - output = self._shared_experts(shared_experts_input) - current_stream().wait_stream(self._shared_experts_stream) + output = self._layer(shared_experts_input) + current_stream().wait_stream(self._stream) return output @@ -206,7 +203,7 @@ def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor: @property def output(self) -> torch.Tensor | None: - assert (self._shared_experts is None) == (self._output is None) + assert (self._layer is None) == (self._output is None) output = self._output self._output = None return output @@ -223,13 +220,13 @@ def apply( if order != experts_order: return None - assert self._shared_experts is not None + assert self._layer is not None assert self._output is None if order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream: self._output = self._call_with_shared_experts_stream(shared_experts_input) else: - self._output = 
self._shared_experts(shared_experts_input) + self._output = self._layer(shared_experts_input) if order == SharedExpertsOrder.EXTERNAL: # TODO: figure out how to combine this with maybe_reduce_output? From 3dc9d4f6a24b65526acba8ec554e7cfdb4380834 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 20 Mar 2026 17:34:42 +0000 Subject: [PATCH 029/191] review comments + cleanup Signed-off-by: Bill Nell --- .../layers/fused_moe/fused_moe_method_base.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 45 +++++++------------ .../layers/fused_moe/modular_kernel.py | 17 +++++-- .../layers/fused_moe/oracle/fp8.py | 2 +- .../layers/fused_moe/oracle/nvfp4.py | 2 +- .../fused_moe/runner/default_moe_runner.py | 22 +++++++-- .../layers/fused_moe/runner/moe_runner.py | 4 ++ .../layers/fused_moe/runner/shared_experts.py | 32 +++---------- 8 files changed, 62 insertions(+), 66 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 9350a9c7de74..8f40a63fa211 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -39,9 +39,7 @@ def supports_internal_mk(self) -> bool: def mk_owns_shared_expert(self) -> bool: # NOTE(rob): temporary attribute to indicate support for # completed migration to the new internal MK interface. 
- return ( - self.moe_kernel is not None and self.moe_kernel.shared_experts is not None - ) + return self.moe_kernel is not None and self.moe_kernel.owns_shared_experts @abstractmethod def create_weights( diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 02831d7d3f88..4ed84a17fa8c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -42,9 +42,6 @@ from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import ( DefaultMoERunner, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) @@ -339,7 +336,6 @@ def __init__( ): super().__init__() - self._gate = gate self._routed_input_transform = routed_input_transform if params_dtype is None: @@ -632,29 +628,17 @@ def _get_quant_method() -> FusedMoEMethodBase: moe_quant_params["intermediate_size_full"] = intermediate_size self.quant_method.create_weights(layer=self, **moe_quant_params) - self.base_quant_method = self.quant_method - self._shared_experts = shared_experts - self.shared_experts = self._init_shared_experts() - self.runner = self._init_runner() + # TODO(bnell): Why is this needed? Can probably be removed. + self.base_quant_method = self.quant_method - def _init_shared_experts(self) -> SharedExperts | None: - if self._shared_experts is None: - return None + self.runner = self._init_runner(gate, shared_experts) - return SharedExperts( - self._shared_experts, - moe_config=self.moe_config, - # Note: For now we must pass quant_method along to SharedExperts so it - # can property determine where the shared experts are supposed to be - # called, i.e. by a MK or by the MoERunner. - # Once the MK can be created upfront, we can just pass in the proper - # flags derived from the quant_method's MK. 
- reduce_results=self.reduce_results, - quant_method=self.quant_method, - ) - - def _init_runner(self) -> DefaultMoERunner: + def _init_runner( + self, + gate: torch.nn.Module | None, + shared_experts: torch.nn.Module | None, + ) -> DefaultMoERunner: # Storing the runner in the FusedMoE is an intermediate state, eventually # the runner will own the FusedMoE layer and provide the execution interface # for MoE ops. @@ -663,8 +647,8 @@ def _init_runner(self) -> DefaultMoERunner: moe_config=self.moe_config, router=self.router, routed_input_transform=self._routed_input_transform, - gate=self._gate, - shared_experts=self.shared_experts, + gate=gate, + shared_experts=shared_experts, quant_method=self.quant_method, reduce_results=self.reduce_results, enable_dbo=self.vllm_config.parallel_config.enable_dbo, @@ -678,7 +662,10 @@ def _replace_quant_method(self, mk: FusedMoEMethodBase): # We need to force reconstruction of runner because we're swapping out # the quant_method with a FusedMoEModularMethod. This logic can go # away once the FusedMoEModularMethod is eliminated. - self.runner = self._init_runner() + self.runner = self._init_runner( + self.runner.gate, + self.runner.shared_experts, + ) # Note: maybe_init_modular_kernel should only be called by # prepare_communication_buffer_for_model. 
@@ -706,7 +693,7 @@ def maybe_init_modular_kernel(self) -> None: self, self.base_quant_method, prepare_finalize, - self.shared_experts, + self.runner.shared_experts, inplace=not self.moe_config.disable_inplace, ) ) @@ -741,7 +728,7 @@ def use_ep(self): @property def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass - return self._gate is not None + return self.runner.is_internal_router() def _maybe_init_expert_routing_tables( self, diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index e424d36536b7..d6e40b69eff2 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1000,7 +1000,10 @@ def __init__( ): self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts - self.shared_experts = shared_experts + # Only accept shared experts if they can be run w/async. + self.shared_experts = ( + shared_experts if prepare_finalize.supports_async() else None + ) self.inplace = inplace moe_parallel_config = fused_experts.moe_config.moe_parallel_config self.moe_parallel_config = moe_parallel_config @@ -1277,7 +1280,8 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - self._maybe_apply_shared_experts(shared_experts_input) + # TODO: remove + # self._maybe_apply_shared_experts(shared_experts_input) # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1461,7 +1465,6 @@ def __init__( inplace: bool = False, ): super().__init__() - self.shared_experts = shared_experts # NOTE: check if we can remove # Initialize the implementation (monolithic or modular). 
self.impl: FusedMoEKernelModularImpl | FusedMoEKernelMonolithicImpl @@ -1478,7 +1481,6 @@ def __init__( elif isinstance( prepare_finalize, FusedMoEPrepareAndFinalizeMonolithic ) and isinstance(fused_experts, FusedMoEExpertsMonolithic): - assert shared_experts is None assert not inplace self.impl = FusedMoEKernelMonolithicImpl( prepare_finalize, @@ -1494,6 +1496,13 @@ def __init__( self._post_init_setup() + @property + def owns_shared_experts(self) -> bool: + if isinstance(self.impl, FusedMoEKernelModularImpl): + return self.impl.shared_experts is not None + else: + return False + @property def is_monolithic(self) -> bool: return isinstance(self.impl, FusedMoEKernelMonolithicImpl) diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index 59432c79d853..3d9a499027cd 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -584,7 +584,7 @@ def make_fp8_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts if prepare_finalize.supports_async() else None, + shared_experts=shared_experts, inplace=( not moe_config.disable_inplace and fp8_backend != Fp8MoeBackend.FLASHINFER_CUTLASS diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index 89cacc8ca01e..940423694673 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ -436,7 +436,7 @@ def make_nvfp4_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts if prepare_finalize.supports_async() else None, + shared_experts=shared_experts, inplace=False, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 0f56da79a053..db7378f2e1b7 100644 --- 
a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -182,7 +182,7 @@ def __init__( router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, - shared_experts: SharedExperts | None, + shared_experts: torch.nn.Module | None, quant_method: FusedMoEMethodBase, reduce_results: bool, enable_dbo: bool, @@ -192,7 +192,21 @@ def __init__( self.router = router self.routed_input_transform = routed_input_transform self.gate = gate - self.shared_experts = shared_experts + + self.shared_experts: SharedExperts | None = None + if shared_experts is not None: + self.shared_experts = SharedExperts( + shared_experts, + moe_config=moe_config, + # Note: For now we must pass quant_method along to SharedExperts so it + # can property determine where the shared experts are supposed to be + # called, i.e. by a MK or by the MoERunner. + # Once the MK can be created upfront, we can just pass in the proper + # flags derived from the quant_method's MK. + reduce_results=reduce_results, + quant_method=quant_method, + ) + self.quant_method = quant_method self.reduce_results = reduce_results self.enable_dbo = enable_dbo @@ -233,6 +247,9 @@ def _select_forward(self, layer: torch.nn.Module) -> tuple[Callable, Callable]: forward_impl_fn, ) + def is_internal_router(self) -> bool: + return self.gate is not None + def _maybe_init_dp_chunking(self): if not self.use_dp_chunking: return @@ -547,7 +564,6 @@ def _maybe_combine( hidden_states, dim=0, ) - # need RS for shared_output? 
if self.shared_experts is not None: assert shared_output is not None diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index b298cc2d0c4c..720e997cda36 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -32,3 +32,7 @@ def maybe_all_reduce_tensor_model_parallel( final_hidden_states: torch.Tensor, ): raise NotImplementedError + + @abstractmethod + def is_internal_router(self) -> bool: + raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index b8c70ac24fae..4d36a700d8f9 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -89,7 +89,7 @@ def _has_external_experts(self) -> bool: # - we are using eplb with non-default backend, because of correctness issues # - we are using flashinfer with DP, since there nothing to gain backend = self._moe_config.moe_parallel_config.all2all_backend - return self._layer is not None and not ( + return not ( ( self._moe_config.moe_parallel_config.enable_eplb and backend != "allgather_reducescatter" @@ -97,10 +97,6 @@ def _has_external_experts(self) -> bool: or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels ) - @property - def _has_mk_owned_shared_experts(self) -> bool: - return not self._quant_method.mk_owns_shared_expert and self._layer is not None - @property def _must_reduce_shared_expert_outputs(self) -> bool: return ( @@ -113,37 +109,24 @@ def _determine_shared_experts_order( self, hidden_states: torch.Tensor, ) -> tuple[SharedExpertsOrder, bool]: - if self._layer is None: - return SharedExpertsOrder.NONE, False - if self._has_external_experts and not self._use_dp_chunking: return SharedExpertsOrder.EXTERNAL, False - if ( - not 
self._has_mk_owned_shared_experts - or not self._moe_config.moe_parallel_config.use_all2all_kernels - ): + if self._quant_method.mk_owns_shared_expert: return SharedExpertsOrder.INTERNAL, False allow_shared_experts_stream = ( current_platform.is_cuda() - and self._has_mk_owned_shared_experts and not self._use_dp_chunking and self._stream is not None and hidden_states.shape[0] <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD ) - # Check if we need to run shared experts before matrix multiply because - # matrix multiply may modify the hidden_states. - run_shared_experts_before = ( - self._has_mk_owned_shared_experts and not allow_shared_experts_stream - ) - - if run_shared_experts_before: - return SharedExpertsOrder.BEFORE_QUANT_METHOD, False + if allow_shared_experts_stream: + return SharedExpertsOrder.AFTER_QUANT_METHOD, True else: - return SharedExpertsOrder.AFTER_QUANT_METHOD, allow_shared_experts_stream + return SharedExpertsOrder.BEFORE_QUANT_METHOD, False def maybe_setup_shared_experts_stream( self, @@ -202,8 +185,8 @@ def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor: return shared_out @property - def output(self) -> torch.Tensor | None: - assert (self._layer is None) == (self._output is None) + def output(self) -> torch.Tensor: + assert self._output is not None output = self._output self._output = None return output @@ -220,7 +203,6 @@ def apply( if order != experts_order: return None - assert self._layer is not None assert self._output is None if order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream: From 12bda3d877e68d9fde63aafdbcf0432f0bf4a1d6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 20 Mar 2026 17:39:43 +0000 Subject: [PATCH 030/191] remove _must_reduce_shared_expert_outputs Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/shared_experts.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py 
b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 4d36a700d8f9..3aafb0eb925b 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -97,14 +97,6 @@ def _has_external_experts(self) -> bool: or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels ) - @property - def _must_reduce_shared_expert_outputs(self) -> bool: - return ( - self._reduce_results - and self._quant_method.moe_kernel is not None - and self._quant_method.moe_kernel.output_is_reduced() - ) - def _determine_shared_experts_order( self, hidden_states: torch.Tensor, @@ -178,7 +170,9 @@ def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor: # Reduce shared expert outputs if necessary, since the MLP # should have been created with reduce_results=False. if ( - self._must_reduce_shared_expert_outputs + self._reduce_results + and self._quant_method.moe_kernel is not None + and self._quant_method.moe_kernel.output_is_reduced() and get_tensor_model_parallel_world_size() > 1 ): shared_out = tensor_model_parallel_all_reduce(shared_out) From 8aaddea26bf3f3cbc0a483ac5df48ead75f5c69f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 23 Mar 2026 17:38:19 +0000 Subject: [PATCH 031/191] undo some changes + add Rob's changes Signed-off-by: Bill Nell --- .../fused_moe/deepep_ll_prepare_finalize.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 58 ++++++++++++++----- .../layers/fused_moe/modular_kernel.py | 8 +-- .../fused_moe/runner/default_moe_runner.py | 25 ++------ .../layers/fused_moe/runner/shared_experts.py | 47 +++++++-------- 5 files changed, 75 insertions(+), 65 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index a22b89415364..5bd3f627ab1f 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ 
b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -306,7 +306,7 @@ def prepare_async( **(dict(use_nvfp4=True) if use_nvfp4 else dict()), **( dict(x_global_scale=qc_a1_gscale_or_scale) - if qc_a1_gscale_or_scale is not None + if qc_a1_gscale_or_scale is not None and nvfp4_dispatch else dict() ), async_finish=False, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 4ed84a17fa8c..456643c8c39e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -42,6 +42,9 @@ from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import ( DefaultMoERunner, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) @@ -632,23 +635,43 @@ def _get_quant_method() -> FusedMoEMethodBase: # TODO(bnell): Why is this needed? Can probably be removed. self.base_quant_method = self.quant_method - self.runner = self._init_runner(gate, shared_experts) + # Note: for now, the layer must keep _gate and _shared_experts. + # This is because a number of locations swap out the quant_method + # which requires re-initializing the SharedExperts and DefaultMoERunner. + # Once we've figured out alternatives to swapping out the quant_method, + # we can move ownership of _gate and _shared_experts into the runner. 
+ self._gate = gate + self._shared_experts = shared_experts + self.runner = self._init_runner() + + def _init_shared_experts(self) -> SharedExperts | None: + if self._shared_experts is None: + return None - def _init_runner( - self, - gate: torch.nn.Module | None, - shared_experts: torch.nn.Module | None, - ) -> DefaultMoERunner: + return SharedExperts( + self._shared_experts, + moe_config=self.moe_config, + # Note: For now we must pass quant_method along to SharedExperts so it + # can property determine where the shared experts are supposed to be + # called, i.e. by a MK or by the MoERunner. + # Once the MK can be created upfront, we can just pass in the proper + # flags derived from the quant_method's MK. + reduce_results=self.reduce_results, + quant_method=self.quant_method, + ) + + def _init_runner(self) -> DefaultMoERunner: # Storing the runner in the FusedMoE is an intermediate state, eventually # the runner will own the FusedMoE layer and provide the execution interface # for MoE ops. + self.shared_experts = self._init_shared_experts() return DefaultMoERunner( layer=self, moe_config=self.moe_config, router=self.router, routed_input_transform=self._routed_input_transform, - gate=gate, - shared_experts=shared_experts, + gate=self._gate, + shared_experts=self.shared_experts, quant_method=self.quant_method, reduce_results=self.reduce_results, enable_dbo=self.vllm_config.parallel_config.enable_dbo, @@ -662,10 +685,7 @@ def _replace_quant_method(self, mk: FusedMoEMethodBase): # We need to force reconstruction of runner because we're swapping out # the quant_method with a FusedMoEModularMethod. This logic can go # away once the FusedMoEModularMethod is eliminated. - self.runner = self._init_runner( - self.runner.gate, - self.runner.shared_experts, - ) + self.runner = self._init_runner() # Note: maybe_init_modular_kernel should only be called by # prepare_communication_buffer_for_model. 
@@ -693,7 +713,7 @@ def maybe_init_modular_kernel(self) -> None: self, self.base_quant_method, prepare_finalize, - self.runner.shared_experts, + self.shared_experts, inplace=not self.moe_config.disable_inplace, ) ) @@ -1422,7 +1442,12 @@ def _maybe_make_contiguous( assert all( weight.is_contiguous() for name, weight in weights - if not (name.startswith("_shared_experts.") or name.startswith("_gate.")) + if not ( + name.startswith("_shared_experts.") + or name.startswith("_gate.") + or name.startswith("_routed_input_transform.") + or name.startswith("_routed_output_transform.") + ) and name not in NON_EXPERT_WEIGHTS ) @@ -1432,8 +1457,11 @@ def _maybe_make_contiguous( if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) and not name.startswith("_shared_experts.") - # exclude parameters from non-expert submodules (e.g. gate/shared) + # exclude parameters from non-expert submodules, + # e.g. gate/shared/transforms. and not name.startswith("_gate.") + and not name.startswith("_routed_input_transform.") + and not name.startswith("_routed_output_transform.") ] def set_eplb_state( diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index d6e40b69eff2..870d24a3c479 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1001,6 +1001,8 @@ def __init__( self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts # Only accept shared experts if they can be run w/async. + # The MoERunner/SharedExperts class will coordinate with the MK to ensure + # that the SharedExperts are executed only once. 
self.shared_experts = ( shared_experts if prepare_finalize.supports_async() else None ) @@ -1084,7 +1086,7 @@ def _maybe_apply_shared_experts( assert shared_experts_input is not None self.shared_experts.apply( shared_experts_input, - SharedExpertsOrder.INTERNAL, + SharedExpertsOrder.MK_INTERNAL_OVERLAPPED, ) def _prepare( @@ -1270,7 +1272,6 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - self._maybe_apply_shared_experts(shared_experts_input) else: finalize_ret = self.prepare_finalize.finalize_async( output, @@ -1280,8 +1281,7 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - # TODO: remove - # self._maybe_apply_shared_experts(shared_experts_input) + self._maybe_apply_shared_experts(shared_experts_input) # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index db7378f2e1b7..41a5c69e60d1 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -182,7 +182,7 @@ def __init__( router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, - shared_experts: torch.nn.Module | None, + shared_experts: SharedExperts | None, quant_method: FusedMoEMethodBase, reduce_results: bool, enable_dbo: bool, @@ -192,21 +192,7 @@ def __init__( self.router = router self.routed_input_transform = routed_input_transform self.gate = gate - - self.shared_experts: SharedExperts | None = None - if shared_experts is not None: - self.shared_experts = SharedExperts( - shared_experts, - moe_config=moe_config, - # Note: For now we must pass quant_method along to SharedExperts so it - # can property determine where the shared experts are 
supposed to be - # called, i.e. by a MK or by the MoERunner. - # Once the MK can be created upfront, we can just pass in the proper - # flags derived from the quant_method's MK. - reduce_results=reduce_results, - quant_method=quant_method, - ) - + self.shared_experts = shared_experts self.quant_method = quant_method self.reduce_results = reduce_results self.enable_dbo = enable_dbo @@ -420,9 +406,10 @@ def _apply_quant_method( shared_experts_input: torch.Tensor | None, ) -> tuple[torch.Tensor | None, torch.Tensor]: # Run this before quant_method to avoid inplace issues. + # TODO(bnell): probably not needed anymore since inplace is + # disabled when shared experts are present. self._maybe_apply_shared_experts( - shared_experts_input, - SharedExpertsOrder.BEFORE_QUANT_METHOD, + shared_experts_input, SharedExpertsOrder.NO_OVERLAP ) if self.quant_method.is_monolithic: @@ -447,7 +434,7 @@ def _apply_quant_method( self._maybe_apply_shared_experts( shared_experts_input, - SharedExpertsOrder.AFTER_QUANT_METHOD, + SharedExpertsOrder.MULTI_STREAM_OVERLAPPED, ) return ( diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 3aafb0eb925b..c4333bcf9f19 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -34,14 +34,14 @@ class SharedExpertsOrder(IntEnum): # get rid of it after _moe_forward is undone. EXTERNAL = (1,) - # Called by modular kernel. - INTERNAL = (2,) + # No overlap - defensively called before MK. + NO_OVERLAP = (2,) - # Called right before quant_method is executed. - BEFORE_QUANT_METHOD = (3,) + # Overlapped with dispatch/combine in DP/EP - called by the MK. + MK_INTERNAL_OVERLAPPED = (3,) - # Called right after quant_method is executed (possibly with streaming). - AFTER_QUANT_METHOD = (4,) + # Overlapped with the gate, router, experts in aux stream. 
+ MULTI_STREAM_OVERLAPPED = (4,) class SharedExperts: @@ -100,14 +100,14 @@ def _has_external_experts(self) -> bool: def _determine_shared_experts_order( self, hidden_states: torch.Tensor, - ) -> tuple[SharedExpertsOrder, bool]: + ) -> SharedExpertsOrder: if self._has_external_experts and not self._use_dp_chunking: - return SharedExpertsOrder.EXTERNAL, False + return SharedExpertsOrder.EXTERNAL if self._quant_method.mk_owns_shared_expert: - return SharedExpertsOrder.INTERNAL, False + return SharedExpertsOrder.MK_INTERNAL_OVERLAPPED - allow_shared_experts_stream = ( + should_run_shared_in_aux_stream = ( current_platform.is_cuda() and not self._use_dp_chunking and self._stream is not None @@ -115,23 +115,18 @@ def _determine_shared_experts_order( <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD ) - if allow_shared_experts_stream: - return SharedExpertsOrder.AFTER_QUANT_METHOD, True + if should_run_shared_in_aux_stream: + return SharedExpertsOrder.MULTI_STREAM_OVERLAPPED else: - return SharedExpertsOrder.BEFORE_QUANT_METHOD, False + return SharedExpertsOrder.NO_OVERLAP def maybe_setup_shared_experts_stream( self, shared_experts_input: torch.Tensor, ): - experts_order, use_shared_experts_stream = self._determine_shared_experts_order( - shared_experts_input, - ) + experts_order = self._determine_shared_experts_order(shared_experts_input) - if ( - experts_order == SharedExpertsOrder.AFTER_QUANT_METHOD - and use_shared_experts_stream - ): + if experts_order == SharedExpertsOrder.MULTI_STREAM_OVERLAPPED: assert self._stream is not None assert self._moe_config.disable_inplace @@ -147,7 +142,7 @@ def maybe_setup_shared_experts_stream( # router/gate (next op below) self._stream.wait_stream(current_stream()) - def _call_with_shared_experts_stream( + def _run_in_aux_stream( self, shared_experts_input: torch.Tensor, ) -> torch.Tensor: @@ -190,17 +185,15 @@ def apply( shared_experts_input: torch.Tensor, order: SharedExpertsOrder, ): - experts_order, use_shared_experts_stream = 
self._determine_shared_experts_order( - shared_experts_input, - ) + experts_order = self._determine_shared_experts_order(shared_experts_input) if order != experts_order: return None assert self._output is None - if order == SharedExpertsOrder.AFTER_QUANT_METHOD and use_shared_experts_stream: - self._output = self._call_with_shared_experts_stream(shared_experts_input) + if order == SharedExpertsOrder.MULTI_STREAM_OVERLAPPED: + self._output = self._run_in_aux_stream(shared_experts_input) else: self._output = self._layer(shared_experts_input) @@ -210,4 +203,6 @@ def apply( assert self._output is not None self._output = self._maybe_reduce_shared_out(self._output) + assert self._output is not None + # TODO(bnell): potentially do AFTER reduce here instead of in runner. From 392f311bb697518d1e15bd9f0f4ffcd1e83358e1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 23 Mar 2026 18:59:52 +0000 Subject: [PATCH 032/191] hacky fix for unquantized method Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c312ca3bd5a2..aacdc19dad90 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -614,13 +614,35 @@ def _get_quant_method() -> FusedMoEMethodBase: # we can move ownership of _gate and _shared_experts into the runner. self._gate = gate self._shared_experts = shared_experts + self.shared_experts = None self.runner = self._init_runner() - def _init_shared_experts(self) -> SharedExperts | None: + def _init_shared_experts(self): # -> SharedExperts | None: if self._shared_experts is None: - return None + return + + # Note: If the SharedExperts already exist, we reinitialize + # them in place. This is because the MK might be holding a + # reference to the same SharedExperts object. 
If we create a + # new instance, the MK will still be holding onto the old one, + # including the old quant_method. This is a workaround for + # UnquantizedFusedMoEMethod's handling of MK initialization + # which should be fixed by #36732. + if self.shared_experts is not None: + self.shared_experts.__init__( + self._shared_experts, + moe_config=self.moe_config, + # Note: For now we must pass quant_method along to SharedExperts so it + # can property determine where the shared experts are supposed to be + # called, i.e. by a MK or by the MoERunner. + # Once the MK can be created upfront, we can just pass in the proper + # flags derived from the quant_method's MK. + reduce_results=self.reduce_results, + quant_method=self.quant_method, + ) + return - return SharedExperts( + self.shared_experts = SharedExperts( self._shared_experts, moe_config=self.moe_config, # Note: For now we must pass quant_method along to SharedExperts so it @@ -636,7 +658,7 @@ def _init_runner(self) -> DefaultMoERunner: # Storing the runner in the FusedMoE is an intermediate state, eventually # the runner will own the FusedMoE layer and provide the execution interface # for MoE ops. 
- self.shared_experts = self._init_shared_experts() + self._init_shared_experts() return DefaultMoERunner( layer=self, moe_config=self.moe_config, From 7d5adbec163461b7e2323860f609c08f40ce3aa9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 23 Mar 2026 22:02:18 +0000 Subject: [PATCH 033/191] fix lint Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 4 ++-- vllm/model_executor/layers/fused_moe/oracle/mxfp4.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index aacdc19dad90..32802c2239de 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -614,7 +614,7 @@ def _get_quant_method() -> FusedMoEMethodBase: # we can move ownership of _gate and _shared_experts into the runner. self._gate = gate self._shared_experts = shared_experts - self.shared_experts = None + self.shared_experts: SharedExperts | None = None self.runner = self._init_runner() def _init_shared_experts(self): # -> SharedExperts | None: @@ -629,7 +629,7 @@ def _init_shared_experts(self): # -> SharedExperts | None: # UnquantizedFusedMoEMethod's handling of MK initialization # which should be fixed by #36732. 
if self.shared_experts is not None: - self.shared_experts.__init__( + self.shared_experts.__init__( # type: ignore self._shared_experts, moe_config=self.moe_config, # Note: For now we must pass quant_method along to SharedExperts so it diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py index 9d1c8e27b7f5..77a53a6c3e5f 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -859,7 +859,6 @@ def make_mxfp4_moe_kernel( if moe_config.moe_parallel_config.use_deepep_ll_kernels else None ), - moe_parallel_config=moe_config.moe_parallel_config, inplace=( not moe_config.disable_inplace and mxfp4_backend not in TRTLLM_BACKENDS ), From f3451658df968639419c3445d6be4ab9ad4fbadf Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 12 Feb 2026 18:03:08 -0500 Subject: [PATCH 034/191] fix lint Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 4 + vllm/model_executor/layers/fused_moe/layer.py | 14 +- .../layers/fused_moe/modular_kernel.py | 2 +- .../fused_moe/router/memoizing_router.py | 35 + .../fused_moe/runner/chunking_moe_runner.py | 219 ++++++ .../fused_moe/runner/default_moe_runner.py | 665 +----------------- .../fused_moe/runner/moe_runner_base.py | 501 +++++++++++++ .../fused_moe/runner/moe_runner_factory.py | 51 ++ 8 files changed, 853 insertions(+), 638 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/router/memoizing_router.py create mode 100644 vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py create mode 100644 vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py create mode 100644 vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index c4950cd75d07..11332f4cb228 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -150,8 +150,11 @@ def 
_inject_lora_into_fused_moe(self): self.base_layer.quant_method.select_gemm_impl( prepare_finalize, self.base_layer ), + self.base_layer.shared_experts, ) + # TODO: could be incorrect due to monolithic kernel? or add assert it + # is modular? if quant_config.use_mxfp4_w4a16: assert isinstance( m_fused_moe_fn.impl.fused_experts, @@ -338,6 +341,7 @@ def wrapper(*args, **kwargs): fused_experts = m_fused_moe_fn.impl.fused_experts + # TODO: seems like this could be done with modular kernel subclasses? m_fused_moe_fn.apply = fwd_decorator(self.base_layer, m_fused_moe_fn.apply) fused_experts.activation = act_decorator( self.base_layer, fused_experts.activation diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 32802c2239de..5a83d5138168 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -39,8 +39,14 @@ from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) -from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import ( - DefaultMoERunner, +from vllm.model_executor.layers.fused_moe.runner.moe_runner import ( + MoERunner, +) +from vllm.model_executor.layers.fused_moe.runner.moe_runner_factory import ( + create_moe_runner, +) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, ) from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( SharedExperts, @@ -654,12 +660,12 @@ def _init_shared_experts(self): # -> SharedExperts | None: quant_method=self.quant_method, ) - def _init_runner(self) -> DefaultMoERunner: + def _init_runner(self) -> MoERunner: # Storing the runner in the FusedMoE is an intermediate state, eventually # the runner will own the FusedMoE layer and provide the execution interface # for MoE ops. 
self._init_shared_experts() - return DefaultMoERunner( + return create_moe_runner( layer=self, moe_config=self.moe_config, router=self.router, diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 870d24a3c479..f51c7fa53b43 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -995,7 +995,7 @@ def __init__( self, prepare_finalize: FusedMoEPrepareAndFinalizeModular, fused_experts: FusedMoEExpertsModular, - shared_experts: SharedExperts | None, + shared_experts: SharedExperts | None = None, inplace: bool = False, ): self.prepare_finalize = prepare_finalize diff --git a/vllm/model_executor/layers/fused_moe/router/memoizing_router.py b/vllm/model_executor/layers/fused_moe/router/memoizing_router.py new file mode 100644 index 000000000000..a55bd2f09d6f --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/router/memoizing_router.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import torch + +from vllm.model_executor.layers.fused_moe.config import ( + RoutingMethodType, +) +from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter + + +class MemoizingRouter(FusedMoERouter): + def __init__(self, router: FusedMoERouter): + self.router = router + + def set_capture_fn( + self, + capture_fn: Callable[[torch.Tensor], None] | None, + ) -> None: + self.router.set_capture_fn(capture_fn) + self.results: tuple[torch.Tensor, torch.Tensor] | None = None + + @property + def routing_method_type(self) -> RoutingMethodType: + return self.router.routing_method_type + + def select_experts( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + if self.results is None: + self.results = self.router.select_experts(hidden_states, 
router_logits) + return self.results diff --git a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py new file mode 100644 index 000000000000..6626fad557b3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.forward_context import ( + get_forward_context, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.runner.moe_runner_base import MoERunnerBase +from vllm.utils.math_utils import cdiv +from vllm.v1.worker.ubatching import dbo_current_ubatch_id +from vllm.v1.worker.workspace import current_workspace_manager + +logger = init_logger(__name__) + + +class ChunkingMoERunner(MoERunnerBase): + """ + MoE runner wrapper that adds chunked processing to any MoERunnerBase. + + This runner wraps an inner MoERunnerBase and overrides _forward_impl to + process large batches by breaking them into smaller chunks. Each chunk + is delegated to the inner runner's _forward_impl, making chunking + composable with any runner implementation. + + All MoERunnerBase state (moe_config, router, quant_method, etc.) is + transparently delegated to the inner runner via __getattr__. + ChunkingMoERunner only owns chunking-specific state: the pre-allocated + workspace buffers and the reduce_results override. + + Key behaviors: + - Pre-allocates workspace tensors for CUDA graph compatibility + - Processes chunks via inner._forward_impl per chunk + - Never reduces results (reduce_results always returns False) + """ + + def __init__(self, inner: MoERunnerBase): + # Assert that _maybe_dispatch/_maybe_combine will be nops. + assert inner.moe_config.pcp_size == 1 + + # Skip MoERunnerBase.__init__ — all state is delegated to inner + # via __getattr__. Only chunking-specific state lives here. 
+ self._inner = inner + + # Pre-allocated staging buffers. These need to exist ahead of time + # due to CUDA graph construction needing fixed buffer addresses. + self.batched_hidden_states, self.batched_router_logits = ( + self._init_dp_chunking() + ) + + def __getattr__(self, name): + # Delegate attribute access to the inner runner. This is only + # called when normal lookup (instance __dict__, class MRO) fails, + # so ChunkingMoERunner's own attributes and methods take priority. + return getattr(self._inner, name) + + @property + def reduce_results(self) -> bool: + return False + + def _init_dp_chunking(self) -> list[torch.Tensor]: + states_shape: tuple[int, ...] + logits_shape: tuple[int, ...] + + moe = self.moe_config + + if self.enable_dbo: + states_shape = (2, moe.max_num_tokens, self.moe_config.hidden_dim) + logits_shape = (2, moe.max_num_tokens, self.moe_config.num_logical_experts) + else: + states_shape = (moe.max_num_tokens, self.moe_config.hidden_dim) + logits_shape = (moe.max_num_tokens, self.moe_config.num_logical_experts) + + # Does this need some kind of profiling run check like modular_kernel.py? + return current_workspace_manager().get_simultaneous( + (states_shape, moe.in_dtype), + (logits_shape, moe.router_logits_dtype), + ) + + def _allocate_dp_chunking_outputs( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor | None, torch.Tensor]: + # Assert the inputs are of the proper type and shape. + assert self.batched_hidden_states is not None + assert self.batched_router_logits is not None + + assert self.batched_hidden_states.dtype == hidden_states.dtype, ( + f"{self.batched_hidden_states.dtype} == {hidden_states.dtype}" + ) + assert self.batched_router_logits.dtype == router_logits.dtype, ( + f"{self.batched_router_logits.dtype} == {router_logits.dtype}" + ) + + # Check size compatibility. 
+ assert self.batched_hidden_states.size(-1) == hidden_states.size(-1) + assert self.batched_router_logits.size(-1) == router_logits.size(-1) + + final_fused_hidden_states = torch.empty_like(hidden_states) + if self.shared_experts is not None: + final_shared_hidden_states = torch.empty_like(hidden_states) + else: + final_shared_hidden_states = None + + return final_shared_hidden_states, final_fused_hidden_states + + def _slice_and_copy_input( + self, + out_slice: torch.Tensor, + orig: torch.Tensor | None, + start: int, + end: int, + ) -> torch.Tensor: + assert orig is not None + slice_size = end - start + orig_slice = orig[start:end, :] + if self.enable_dbo: + assert out_slice.dim() == 3 + batch_buffer_idx = dbo_current_ubatch_id() + out_slice = out_slice[batch_buffer_idx, :] + + assert out_slice.size(0) >= slice_size + out_slice = out_slice[:slice_size, :] + out_slice.copy_(orig_slice, non_blocking=True) + return out_slice + + def _forward_impl( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + final_shared_hidden_states, final_fused_hidden_states = ( + self._allocate_dp_chunking_outputs(hidden_states, router_logits) + ) + + ctx = get_forward_context() + # flashinfer_cutlass_kernels can handle: optional DP + TP/EP + max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu + moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens + + # If the input to the MoE is sequence parallel then divide by sp_size + # to find the maximum number of tokens for any individual dispatcher. 
+ if self.moe_config.is_sequence_parallel: + max_tokens_across_dispatchers = cdiv( + max_tokens_across_dispatchers, self.moe_config.sp_size + ) + + num_tokens = hidden_states.size(0) + for chunk_idx, chunk_start_ in enumerate( + range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank) + ): + chunk_start = chunk_start_ + chunk_end = min( + chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dispatchers + ) + # clamp start and end + chunk_start = min(chunk_start, num_tokens - 1) + chunk_end = min(chunk_end, num_tokens) + chunk_sizes = ctx.dp_metadata.chunked_sizes( + self.moe_config.sp_size, moe_dp_chunk_size_per_rank, chunk_idx + ) + with chunk_sizes: + hidden_states_chunk = self._slice_and_copy_input( + self.batched_hidden_states, + hidden_states, + chunk_start, + chunk_end, + ) + + router_logits_chunk = self._slice_and_copy_input( + self.batched_router_logits, + router_logits, + chunk_start, + chunk_end, + ) + + shared_experts_input_chunk = ( + shared_experts_input[chunk_start:chunk_end, :] + if shared_experts_input is not None + else None + ) + + # Delegate per-chunk computation to the inner runner. 
+ chunk_result = self._inner._forward_impl( + layer=layer, + hidden_states=hidden_states_chunk, + router_logits=router_logits_chunk, + shared_experts_input=shared_experts_input_chunk, + ) + + # Store outputs + # TODO(bnell): document when chunk_start >= num_tokens + if chunk_start < num_tokens: + if self.shared_experts is not None: + assert isinstance(chunk_result, tuple) + shared_output_chunk, hidden_states_chunk = chunk_result + final_fused_hidden_states[chunk_start:chunk_end, :].copy_( + hidden_states_chunk, non_blocking=True + ) + assert shared_output_chunk is not None + assert final_shared_hidden_states is not None + final_shared_hidden_states[chunk_start:chunk_end, :].copy_( + shared_output_chunk, non_blocking=True + ) + else: + assert isinstance(chunk_result, torch.Tensor) + final_fused_hidden_states[chunk_start:chunk_end, :].copy_( + chunk_result, non_blocking=True + ) + + if self.shared_experts is None: + return final_fused_hidden_states + else: + assert final_shared_hidden_states is not None + return (final_shared_hidden_states, final_fused_hidden_states) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 41a5c69e60d1..07d63cab2220 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -1,21 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable -from contextlib import nullcontext -from typing import TYPE_CHECKING import torch -import torch.nn.functional as F from vllm.distributed import ( get_ep_group, get_pcp_group, - tensor_model_parallel_all_reduce, -) -from vllm.forward_context import ( - ForwardContext, - get_forward_context, - is_forward_context_available, ) from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( @@ 
-27,152 +17,39 @@ from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, ) -from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner +from vllm.model_executor.layers.fused_moe.runner.moe_runner_base import MoERunnerBase from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( SharedExperts, - SharedExpertsOrder, -) -from vllm.platforms import current_platform -from vllm.utils.math_utils import cdiv -from vllm.utils.torch_utils import ( - HAS_OPAQUE_TYPE, - ModuleName, - direct_register_custom_op, ) -from vllm.v1.worker.ubatching import dbo_current_ubatch_id logger = init_logger(__name__) -def get_layer_from_name(layer_name: str) -> torch.nn.Module: - forward_context: ForwardContext = get_forward_context() - if layer_name == "from_forward_context": - all_moe_layers = forward_context.all_moe_layers - assert all_moe_layers is not None - moe_layer_index = forward_context.moe_layer_index - if moe_layer_index >= len(all_moe_layers): - raise AssertionError( - "We expected the number of MOE layers in `all_moe_layers` " - "to be equal to the number of " - "{vllm.moe_forward, vllm.moe_forward_shared} calls." - ) - layer_name = all_moe_layers[moe_layer_index] - forward_context.moe_layer_index += 1 - return forward_context.no_compile_layers[layer_name] - - -# On torch >= 2.11, layer_name is a hoisted ModuleName opaque object; -# on older versions it remains a plain str. -if TYPE_CHECKING: - from typing import TypeAlias - - _layer_name_type: TypeAlias = str | ModuleName -else: - _layer_name_type = ModuleName if HAS_OPAQUE_TYPE else str - - -def _resolve_layer_name(layer_name: str | ModuleName) -> str: - return layer_name.value if isinstance(layer_name, ModuleName) else layer_name - - -# Note: _moe_forward and _moe_forward_shared should not contain any -# implementation details, They should merely pass along control to -# the runner's 'forward_dispatch' method. 
-def _moe_forward( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: _layer_name_type, -) -> torch.Tensor: - layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer.runner.forward_dispatch( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) - - -def _moe_forward_fake( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: _layer_name_type, -) -> torch.Tensor: - return torch.empty_like(hidden_states) - - -def _moe_forward_shared( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: _layer_name_type, -) -> tuple[torch.Tensor, torch.Tensor]: - layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer.runner.forward_dispatch( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) - - -def _moe_forward_shared_fake( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: _layer_name_type, -) -> tuple[torch.Tensor, torch.Tensor]: - # Output shapes: - # - fused_out: same as hidden_states (routed experts use transformed size) - # - shared_out: same as shared_experts_input if provided, else same as - # hidden_states - # (For latent MoE: shared experts use original hidden_size, not latent size) - fused_out = torch.empty_like(hidden_states) - if shared_experts_input is not None: - shared_out = torch.empty_like(shared_experts_input) - else: - shared_out = torch.empty_like(hidden_states) - return shared_out, fused_out - - -direct_register_custom_op( - op_name="moe_forward", - op_func=_moe_forward, - mutates_args=["hidden_states"], # is this still true? 
- fake_impl=_moe_forward_fake, - tags=(torch.Tag.needs_fixed_stride_order,), -) - - -direct_register_custom_op( - op_name="moe_forward_shared", - op_func=_moe_forward_shared, - fake_impl=_moe_forward_shared_fake, - tags=(torch.Tag.needs_fixed_stride_order,), -) - - -class DefaultMoERunner(MoERunner): +class DefaultMoERunner(MoERunnerBase): """ - Default implementation of the MoE runner for executing Mixture of Experts layers. + Standard MoE runner implementation for executing Mixture of Experts layers. - This class provides a comprehensive implementation for running MoE computations - with support for: - - Expert routing and token dispatching + This is the primary concrete implementation of MoE execution logic, providing + comprehensive support for standard MoE operations. It handles: + - Expert routing and token dispatching using various routing strategies - Shared experts computation with optional parallel execution using CUDA streams - - Data parallel (DP) chunking for large batch processing - Tensor model parallel and expert parallel operations - - Various quantization methods and custom operators + - Multiple quantization methods and optimized kernel selection - Both monolithic and decomposed expert execution paths + - Integration with various parallel execution modes (TP, EP, DP) - The runner handles the complete MoE forward pass including routing tokens to - experts, executing expert computations, and combining results. It supports - advanced features like overlapped execution of shared experts and optimized - kernels for different parallel execution modes. + The runner orchestrates the complete MoE forward pass including routing tokens + to experts, executing expert computations in parallel, and combining results. + It supports advanced features like overlapped execution of shared experts, + optimized kernels for different parallel configurations, and seamless + integration with vLLM's distributed execution framework. 
- Eventually, this class will be split up and specialized for different - configurations, e.g. the presence or absence of shared experts, a gate, etc. + This implementation is suitable for most standard MoE use cases. For specialized + scenarios like large batch chunking, alternative runners like ChunkingMoERunner + may be more appropriate. + + Eventually, this class may be split into more specialized implementations + for different configurations (e.g., with/without shared experts, gates, etc.). """ def __init__( @@ -187,316 +64,24 @@ def __init__( reduce_results: bool, enable_dbo: bool, ): - super().__init__() - self.moe_config = moe_config - self.router = router - self.routed_input_transform = routed_input_transform - self.gate = gate - self.shared_experts = shared_experts - self.quant_method = quant_method - self.reduce_results = reduce_results - self.enable_dbo = enable_dbo - self.enable_eplb = moe_config.moe_parallel_config.enable_eplb - - # Chunked all2all staging tensor - # These need to exist ahead of time due to CUDAgraph construction - # needing a fixed buffer address. - self.use_dp_chunking = self.moe_config.moe_parallel_config.use_dp_chunking - self.batched_hidden_states: torch.Tensor | None = None - self.batched_router_logits: torch.Tensor | None = None - self._maybe_init_dp_chunking() - - # Needed for string -> FusedMoE layer lookup in custom ops. - self.layer_name = layer.layer_name - - self.forward_entry, self.forward_impl = self._select_forward(layer) - - def _select_forward(self, layer: torch.nn.Module) -> tuple[Callable, Callable]: - # Select implementation based on presence of DP chunking. - forward_impl_fn = ( - self._forward_impl_chunked if self.use_dp_chunking else self._forward_impl - ) - - if current_platform.is_tpu() or current_platform.is_cpu(): - # TODO: Once the OOM issue for the TPU backend is resolved, we - # will switch to using the moe_forward custom op. - # Note: CPU doesn't require wrapped forward_impl. 
- return ( - _moe_forward if self.shared_experts is None else _moe_forward_shared, - forward_impl_fn, - ) - - return ( - torch.ops.vllm.moe_forward - if self.shared_experts is None - else torch.ops.vllm.moe_forward_shared, - forward_impl_fn, + super().__init__( + layer, + moe_config, + router, + routed_input_transform, + gate, + shared_experts, + quant_method, + reduce_results, + enable_dbo, ) def is_internal_router(self) -> bool: return self.gate is not None - def _maybe_init_dp_chunking(self): - if not self.use_dp_chunking: - return - - assert self.batched_hidden_states is None - states_shape: tuple[int, ...] - logits_shape: tuple[int, ...] - - moe = self.moe_config - - if self.enable_dbo: - states_shape = (2, moe.max_num_tokens, self.moe_config.hidden_dim) - logits_shape = (2, moe.max_num_tokens, self.moe_config.num_logical_experts) - else: - states_shape = (moe.max_num_tokens, self.moe_config.hidden_dim) - logits_shape = (moe.max_num_tokens, self.moe_config.num_logical_experts) - - device = torch.accelerator.current_device_index() - self.batched_hidden_states = torch.zeros( - states_shape, - dtype=moe.in_dtype, - device=device, - ) - - self.batched_router_logits = torch.zeros( - logits_shape, - dtype=moe.router_logits_dtype, - device=device, - ) - - def must_reduce_shared_expert_outputs(self) -> bool: - """ - The shared_experts are typically computed using the RowParallelLinear - layer. The result of this function is typically used as - the reduce_results argument to the module. - When just tensor-parallel is used, it is not required to reduce - the shared_experts results immediately. Instead we reduce at the - once at the end of the MoE op. (Refer to DeepSeekV2MoE module) - With EP and all2all kernels - this is no longer viable as all - GPU ranks in DP, produce the complete set of hidden_states. - Therefore it is required that we reduce the shared_experts output - early. 
- """ - return ( - self.quant_method.moe_kernel is not None - and self.quant_method.moe_kernel.output_is_reduced() - ) - - def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor): - """ - Some combine kernels reduce across GPU ranks by default. - """ - if self.must_reduce_shared_expert_outputs(): - return final_hidden_states - else: - return tensor_model_parallel_all_reduce(final_hidden_states) - - def apply_routed_input_transform( - self, hidden_states: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor | None]: - """Apply transform for routed experts (e.g., latent projection). - - This is called by FusedMoE.forward_native. The original hidden_states - is saved separately so shared experts get [S, hidden_size] while - routed experts get the transformed [S, moe_latent_size]. - - TODO: For latent MoE bandwidth optimization, fc2_latent_proj could be - moved inside SharedFusedMoE to all-reduce on the smaller latent - dimension. - - Returns (possibly transformed) hidden states and the input for shared - experts (or None if there are no shared experts). - """ - if self.routed_input_transform is not None: - result = self.routed_input_transform(hidden_states) - # ReplicatedLinear returns (output, extra_bias) tuple. - # We only need the output tensor; extra_bias is not used here. 
- if isinstance(result, tuple): - return result[0], hidden_states - return result, hidden_states - - return ( - hidden_states, - hidden_states if self.shared_experts is not None else None, - ) - - def _maybe_reduce_output( - self, - states: torch.Tensor | tuple[torch.Tensor, torch.Tensor], - trunc_sizes: list[int], - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - def trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: - return x[..., :trunc_size] - - def reduce_and_trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: - return trunc(self.maybe_all_reduce_tensor_model_parallel(x), trunc_size) - - if ( - not self.moe_config.is_sequence_parallel - and not self.use_dp_chunking - and self.reduce_results - and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) - ): - func = reduce_and_trunc - else: - func = trunc - - if isinstance(states, tuple): - return tuple( - [func(s, trunc_size) for s, trunc_size in zip(states, trunc_sizes)] - ) - else: - assert len(trunc_sizes) == 1 - return func(states, trunc_sizes[0]) - - def _encode_layer_name(self) -> str | ModuleName: - if HAS_OPAQUE_TYPE: - return ModuleName(self.layer_name) - # Can be unavailable or None in unittests - if ( - is_forward_context_available() - and get_forward_context().all_moe_layers is not None - ): - return "from_forward_context" - return self.layer_name - - def _maybe_pad_hidden_states( - self, - shared_experts_input: torch.Tensor | None, - hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, list[int]]: - shared_experts_hidden_dim = ( - shared_experts_input.shape[-1] if shared_experts_input is not None else 0 - ) - transformed_hidden_dim = hidden_states.shape[-1] - if ( - not self.quant_method.skip_forward_padding - and self.moe_config.hidden_dim != transformed_hidden_dim - ): - hidden_states = F.pad( - hidden_states, - (0, self.moe_config.hidden_dim - transformed_hidden_dim), - mode="constant", - value=0.0, - ) - - if self.shared_experts is not None: - orig_hidden_dims = 
[shared_experts_hidden_dim, transformed_hidden_dim] - else: - orig_hidden_dims = [transformed_hidden_dim] - - return hidden_states, orig_hidden_dims - - def _maybe_apply_shared_experts( - self, - shared_experts_input: torch.Tensor | None, - order: SharedExpertsOrder, - ): - if self.shared_experts is not None: - assert shared_experts_input is not None - self.shared_experts.apply(shared_experts_input, order) - - def _apply_quant_method( - self, - layer: torch.nn.Module, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> tuple[torch.Tensor | None, torch.Tensor]: - # Run this before quant_method to avoid inplace issues. - # TODO(bnell): probably not needed anymore since inplace is - # disabled when shared experts are present. - self._maybe_apply_shared_experts( - shared_experts_input, SharedExpertsOrder.NO_OVERLAP - ) - - if self.quant_method.is_monolithic: - fused_out = self.quant_method.apply_monolithic( - layer=layer, - x=hidden_states, - router_logits=router_logits, - ) - else: - topk_weights, topk_ids = self.router.select_experts( - hidden_states=hidden_states, - router_logits=router_logits, - ) - - fused_out = self.quant_method.apply( - layer=layer, - x=hidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - shared_experts_input=shared_experts_input, - ) - - self._maybe_apply_shared_experts( - shared_experts_input, - SharedExpertsOrder.MULTI_STREAM_OVERLAPPED, - ) - - return ( - self.shared_experts.output if self.shared_experts is not None else None, - fused_out, - ) - - def _sequence_parallel_context(self): - ctx = get_forward_context() - return ( - ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size) - if ctx.dp_metadata - else nullcontext() - ) - - def _allocate_dp_chunking_outputs( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor | None, torch.Tensor]: - assert self.use_dp_chunking - - # Assert the inputs are of the proper type and 
shape. - assert self.batched_hidden_states is not None - assert self.batched_router_logits is not None - - assert self.batched_hidden_states.dtype == hidden_states.dtype, ( - f"{self.batched_hidden_states.dtype} == {hidden_states.dtype}" - ) - assert self.batched_router_logits.dtype == router_logits.dtype, ( - f"{self.batched_router_logits.dtype} == {router_logits.dtype}" - ) - - # Check size compatibility. - assert self.batched_hidden_states.size(-1) == hidden_states.size(-1) - assert self.batched_router_logits.size(-1) == router_logits.size(-1) - - final_fused_hidden_states = torch.empty_like(hidden_states) - if self.shared_experts is not None: - final_shared_hidden_states = torch.empty_like(hidden_states) - else: - final_shared_hidden_states = None - - return final_shared_hidden_states, final_fused_hidden_states - - def _maybe_overlap_gate_with_shared_experts( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor: - # If router/gate provided, then apply it here. - # (Note: This code runs only when "overlapped mode" is on to allow - # parallel execution of shared experts with the FusedMoE via - # separate cuda stream) - if self.shared_experts is not None: - self.shared_experts.maybe_setup_shared_experts_stream(shared_experts_input) - - if self.gate is not None: - router_logits, _ = self.gate(hidden_states) - - return router_logits + @property + def reduce_results(self) -> bool: + return self._reduce_results @property def do_naive_dispatch_combine(self) -> bool: @@ -558,192 +143,6 @@ def _maybe_combine( else: return hidden_states - def forward( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - """Invoke the fused moe layer. - - Input: - - hidden_states - - router_logits - - Output: - - The new hidden_states. - or - - A tuple of (shared experts output, new hidden_states). 
- - Calling sequence - - forward - - self.forward_entry (_moe_forward or _moe_forward_shared custom op) - - forward_dispatch - - forward_impl (_forward_impl or _forward_impl_chunked) - - Note: The existence of _moe_forward and _moe_forward_shared custom ops are due - to the following reasons: - 1. the chunking loop in _forward_impl_chunked cannot be compiled by - torch.compile - 2. pytorch cannot handle union types in custom op signatures so _moe_forward - and _moe_forward_shared must be split. - - If _forward_impl_chunked can be implemented via torch.scan we can potentially - get rid of _moe_forward and _moe_forward_shared and collapse the whole sequence - into the 'forward' method. - """ - - # Apply transform for routed experts (e.g., latent projection for latent MoE) - hidden_states, shared_experts_input = self.apply_routed_input_transform( - hidden_states - ) - - hidden_states, og_hidden_dims = self._maybe_pad_hidden_states( - shared_experts_input, - hidden_states, - ) - - fused_output = self.forward_entry( - hidden_states, - router_logits, - shared_experts_input, - self._encode_layer_name(), - ) - - return self._maybe_reduce_output(fused_output, og_hidden_dims) - - def forward_dispatch( - self, - layer: torch.nn.Module, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # TODO(bnell): this can be removed after MK migration is complete. 
- layer.ensure_moe_quant_config_init() - - router_logits = self._maybe_overlap_gate_with_shared_experts( - hidden_states, - router_logits, - shared_experts_input, - ) - - self._maybe_apply_shared_experts( - shared_experts_input, - SharedExpertsOrder.EXTERNAL, - ) - - with self._sequence_parallel_context(): - return self.forward_impl( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) - - def _slice_and_copy_input( - self, - out_slice: torch.Tensor, - orig: torch.Tensor | None, - start: int, - end: int, - ) -> torch.Tensor: - assert orig is not None - slice_size = end - start - orig_slice = orig[start:end, :] - if self.enable_dbo: - assert out_slice.dim() == 3 - batch_buffer_idx = dbo_current_ubatch_id() - out_slice = out_slice[batch_buffer_idx, :] - - assert out_slice.size(0) >= slice_size - out_slice = out_slice[:slice_size, :] - out_slice.copy_(orig_slice, non_blocking=True) - return out_slice - - def _forward_impl_chunked( - self, - layer: torch.nn.Module, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - final_shared_hidden_states, final_fused_hidden_states = ( - self._allocate_dp_chunking_outputs(hidden_states, router_logits) - ) - - ctx = get_forward_context() - # flashinfer_cutlass_kernels can handle: optional DP + TP/EP - max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu - moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens - - # If the input to the MoE is sequence parallel then divide by sp_size - # to find the maximum number of tokens for any individual dispatcher. 
- if self.moe_config.is_sequence_parallel: - max_tokens_across_dispatchers = cdiv( - max_tokens_across_dispatchers, self.moe_config.sp_size - ) - - num_tokens = hidden_states.size(0) - for chunk_idx, chunk_start_ in enumerate( - range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank) - ): - chunk_start = chunk_start_ - chunk_end = min( - chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dispatchers - ) - # clamp start and end - chunk_start = min(chunk_start, num_tokens - 1) - chunk_end = min(chunk_end, num_tokens) - chunk_sizes = ctx.dp_metadata.chunked_sizes( - self.moe_config.sp_size, moe_dp_chunk_size_per_rank, chunk_idx - ) - with chunk_sizes: - hidden_states_chunk = self._slice_and_copy_input( - self.batched_hidden_states, - hidden_states, - chunk_start, - chunk_end, - ) - - router_logits_chunk = self._slice_and_copy_input( - self.batched_router_logits, - router_logits, - chunk_start, - chunk_end, - ) - - shared_experts_input_chunk = ( - shared_experts_input[chunk_start:chunk_end, :] - if shared_experts_input is not None - else None - ) - - shared_output_chunk, hidden_states_chunk = self._apply_quant_method( - layer=layer, - hidden_states=hidden_states_chunk, - router_logits=router_logits_chunk, - shared_experts_input=shared_experts_input_chunk, - ) - - # Store outputs - # TODO(bnell): document when chunk_start >= num_tokens - if chunk_start < num_tokens: - final_fused_hidden_states[chunk_start:chunk_end, :].copy_( - hidden_states_chunk, non_blocking=True - ) - if self.shared_experts is not None: - assert shared_output_chunk is not None - assert final_shared_hidden_states is not None - final_shared_hidden_states[chunk_start:chunk_end, :].copy_( - shared_output_chunk, non_blocking=True - ) - - if self.shared_experts is None: - return final_fused_hidden_states - else: - assert final_shared_hidden_states is not None - return (final_shared_hidden_states, final_fused_hidden_states) - def _forward_impl( self, layer: torch.nn.Module, diff --git 
a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py new file mode 100644 index 000000000000..2a6e5713827b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -0,0 +1,501 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import abstractmethod +from collections.abc import Callable +from contextlib import nullcontext +from typing import TYPE_CHECKING + +import torch +import torch.nn.functional as F + +from vllm.distributed import ( + tensor_model_parallel_all_reduce, +) +from vllm.forward_context import ( + ForwardContext, + get_forward_context, + is_forward_context_available, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, +) +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, +) +from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( + FusedMoERouter, +) +from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, + SharedExpertsOrder, +) +from vllm.platforms import current_platform +from vllm.utils.torch_utils import ( + HAS_OPAQUE_TYPE, + ModuleName, + direct_register_custom_op, +) + +logger = init_logger(__name__) + + +def get_layer_from_name(layer_name: str) -> torch.nn.Module: + forward_context: ForwardContext = get_forward_context() + if layer_name == "from_forward_context": + all_moe_layers = forward_context.all_moe_layers + assert all_moe_layers is not None + moe_layer_index = forward_context.moe_layer_index + if moe_layer_index >= len(all_moe_layers): + raise AssertionError( + "We expected the number of MOE layers in `all_moe_layers` " + "to be equal to the number of " + "{vllm.moe_forward, vllm.moe_forward_shared} calls." 
+ ) + layer_name = all_moe_layers[moe_layer_index] + forward_context.moe_layer_index += 1 + return forward_context.no_compile_layers[layer_name] + + +# On torch >= 2.11, layer_name is a hoisted ModuleName opaque object; +# on older versions it remains a plain str. +if TYPE_CHECKING: + from typing import TypeAlias + + _layer_name_type: TypeAlias = str | ModuleName +else: + _layer_name_type = ModuleName if HAS_OPAQUE_TYPE else str + + +def _resolve_layer_name(layer_name: str | ModuleName) -> str: + return layer_name.value if isinstance(layer_name, ModuleName) else layer_name + + +# Note: _moe_forward and _moe_forward_shared should not contain any +# implementation details. They should merely pass along control to +# the runner's 'forward_dispatch' method. +def _moe_forward( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: _layer_name_type, +) -> torch.Tensor: + layer = get_layer_from_name(_resolve_layer_name(layer_name)) + return layer.runner.forward_dispatch( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) + + +def _moe_forward_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: _layer_name_type, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +def _moe_forward_shared( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: _layer_name_type, +) -> tuple[torch.Tensor, torch.Tensor]: + layer = get_layer_from_name(_resolve_layer_name(layer_name)) + return layer.runner.forward_dispatch( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) + + +def _moe_forward_shared_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: _layer_name_type, +) -> tuple[torch.Tensor, torch.Tensor]: + # Output shapes: + # - fused_out: same as 
hidden_states (routed experts use transformed size) + # - shared_out: same as shared_experts_input if provided, else same as + # hidden_states + # (For latent MoE: shared experts use original hidden_size, not latent size) + fused_out = torch.empty_like(hidden_states) + if shared_experts_input is not None: + shared_out = torch.empty_like(shared_experts_input) + else: + shared_out = torch.empty_like(hidden_states) + return shared_out, fused_out + + +direct_register_custom_op( + op_name="moe_forward", + op_func=_moe_forward, + mutates_args=["hidden_states"], # is this still true? + fake_impl=_moe_forward_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) + + +direct_register_custom_op( + op_name="moe_forward_shared", + op_func=_moe_forward_shared, + fake_impl=_moe_forward_shared_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) + + +class MoERunnerBase(MoERunner): + """ + Abstract base class providing common functionality for MoE runner implementations. + + This class serves as the foundation for concrete MoE runner implementations by + providing shared state management and common utilities. It handles: + - Common initialization and configuration management + - Shared expert output reduction logic for tensor parallel scenarios + - Base methods for tensor model parallel reductions + - Common properties and utility functions used across different runner types + + Concrete subclasses must implement the abstract methods to define their specific + execution strategies, such as standard execution, chunked processing, or other + specialized approaches. The base class provides the infrastructure while + allowing flexibility in the actual MoE computation implementation. 
+ + Key abstract methods that subclasses must implement: + - reduce_results: Determines whether results should be reduced across ranks + - _forward_impl: The core MoE computation logic specific to each runner type + """ + + def __init__( + self, + layer: torch.nn.Module, + moe_config: FusedMoEConfig, + router: FusedMoERouter, + routed_input_transform: torch.nn.Module | None, + gate: torch.nn.Module | None, + shared_experts: SharedExperts | None, + quant_method: FusedMoEMethodBase, + reduce_results: bool, + enable_dbo: bool, + ): + super().__init__() + self.moe_config = moe_config + self.router = router + self.routed_input_transform = routed_input_transform + self.gate = gate + self.shared_experts = shared_experts + self.quant_method = quant_method + self._reduce_results = reduce_results + self.enable_dbo = enable_dbo + self.enable_eplb = moe_config.moe_parallel_config.enable_eplb + + # Needed for string -> FusedMoE layer lookup in custom ops. + self.layer_name = layer.layer_name + + self.forward_entry = self._select_forward(layer) + + def _select_forward(self, layer: torch.nn.Module) -> Callable: + if current_platform.is_tpu() or current_platform.is_cpu(): + # TODO: Once the OOM issue for the TPU backend is resolved, we + # will switch to using the moe_forward custom op. + # Note: CPU doesn't require wrapped _forward_impl. + return _moe_forward if self.shared_experts is None else _moe_forward_shared + + return ( + torch.ops.vllm.moe_forward + if self.shared_experts is None + else torch.ops.vllm.moe_forward_shared + ) + + @property + @abstractmethod + def reduce_results(self) -> bool: + raise NotImplementedError + + def must_reduce_shared_expert_outputs(self) -> bool: + """ + The shared_experts are typically computed using the RowParallelLinear + layer. The result of this function is typically used as + the reduce_results argument to the module. + When just tensor-parallel is used, it is not required to reduce + the shared_experts results immediately. 
Instead we reduce + once at the end of the MoE op. (Refer to DeepSeekV2MoE module) + With EP and all2all kernels - this is no longer viable as all + GPU ranks in DP produce the complete set of hidden_states. + Therefore it is required that we reduce the shared_experts output + early. + """ + return ( + self.quant_method.moe_kernel is not None + and self.quant_method.moe_kernel.output_is_reduced() + ) + + def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor): + """ + Some combine kernels reduce across GPU ranks by default. + """ + if self.must_reduce_shared_expert_outputs(): + return final_hidden_states + else: + return tensor_model_parallel_all_reduce(final_hidden_states) + + def apply_routed_input_transform( + self, hidden_states: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """Apply transform for routed experts (e.g., latent projection). + + This is called by FusedMoE.forward_native. The original hidden_states + is saved separately so shared experts get [S, hidden_size] while + routed experts get the transformed [S, moe_latent_size]. + + TODO: For latent MoE bandwidth optimization, fc2_latent_proj could be + moved inside SharedFusedMoE to all-reduce on the smaller latent + dimension. + + Returns (possibly transformed) hidden states and the input for shared + experts (or None if there are no shared experts). + """ + if self.routed_input_transform is not None: + result = self.routed_input_transform(hidden_states) + # ReplicatedLinear returns (output, extra_bias) tuple. + # We only need the output tensor; extra_bias is not used here. 
+ if isinstance(result, tuple): + return result[0], hidden_states + return result, hidden_states + + return ( + hidden_states, + hidden_states if self.shared_experts is not None else None, + ) + + def _maybe_reduce_output( + self, + states: torch.Tensor | tuple[torch.Tensor, torch.Tensor], + trunc_sizes: list[int], + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + def trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: + return x[..., :trunc_size] + + def reduce_and_trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: + return trunc(self.maybe_all_reduce_tensor_model_parallel(x), trunc_size) + + if ( + not self.moe_config.is_sequence_parallel + and self.reduce_results + and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) + ): + func = reduce_and_trunc + else: + func = trunc + + if isinstance(states, tuple): + return tuple( + [func(s, trunc_size) for s, trunc_size in zip(states, trunc_sizes)] + ) + else: + assert len(trunc_sizes) == 1 + return func(states, trunc_sizes[0]) + + def _encode_layer_name(self) -> str | ModuleName: + if HAS_OPAQUE_TYPE: + return ModuleName(self.layer_name) + # Can be unavailable or None in unittests + if ( + is_forward_context_available() + and get_forward_context().all_moe_layers is not None + ): + return "from_forward_context" + return self.layer_name + + def _maybe_pad_hidden_states( + self, + shared_experts_input: torch.Tensor | None, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, list[int]]: + shared_experts_hidden_dim = ( + shared_experts_input.shape[-1] if shared_experts_input is not None else 0 + ) + transformed_hidden_dim = hidden_states.shape[-1] + if ( + not self.quant_method.skip_forward_padding + and self.moe_config.hidden_dim != transformed_hidden_dim + ): + hidden_states = F.pad( + hidden_states, + (0, self.moe_config.hidden_dim - transformed_hidden_dim), + mode="constant", + value=0.0, + ) + + if self.shared_experts is not None: + orig_hidden_dims = [shared_experts_hidden_dim, 
transformed_hidden_dim] + else: + orig_hidden_dims = [transformed_hidden_dim] + + return hidden_states, orig_hidden_dims + + def _maybe_apply_shared_experts( + self, + shared_experts_input: torch.Tensor | None, + order: SharedExpertsOrder, + ): + if self.shared_experts is not None: + assert shared_experts_input is not None + self.shared_experts.apply(shared_experts_input, order) + + def _apply_quant_method( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + ) -> tuple[torch.Tensor | None, torch.Tensor]: + # Run this before quant_method to avoid inplace issues. + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.BEFORE_QUANT_METHOD, + ) + + if self.quant_method.is_monolithic: + fused_out = self.quant_method.apply_monolithic( + layer=layer, + x=hidden_states, + router_logits=router_logits, + ) + else: + topk_weights, topk_ids = self.router.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + ) + + fused_out = self.quant_method.apply( + layer=layer, + x=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts_input=shared_experts_input, + ) + + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.AFTER_QUANT_METHOD, + ) + + return ( + self.shared_experts.output if self.shared_experts is not None else None, + fused_out, + ) + + def _sequence_parallel_context(self): + ctx = get_forward_context() + return ( + ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size) + if ctx.dp_metadata + else nullcontext() + ) + + def _maybe_gate( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor: + # If router/gate provided, then apply it here. 
+ # (Note: This code runs only when "overlapped mode" is on to allow + # parallel execution of shared experts with the FusedMoE via + # separate cuda stream) + if self.gate is not None: + router_logits, _ = self.gate(hidden_states) + return router_logits + + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """Invoke the fused moe layer. + + Input: + - hidden_states + - router_logits + + Output: + - The new hidden_states. + or + - A tuple of (shared experts output, new hidden_states). + + Calling sequence + - forward + - self.forward_entry (_moe_forward or _moe_forward_shared custom op) + - forward_dispatch + - _forward_impl + + Note: The existence of _moe_forward and _moe_forward_shared custom ops is due + to the following reasons: + 1. the chunking loop in ChunkingMoERunner._forward_impl cannot be compiled by + torch.compile + 2. pytorch cannot handle union types in custom op signatures so _moe_forward + and _moe_forward_shared must be split. + + If ChunkingMoERunner._forward_impl can be implemented via torch.scan we can + potentially get rid of _moe_forward and _moe_forward_shared and collapse the + whole sequence into the 'forward' method. 
+ """ + + # Apply transform for routed experts (e.g., latent projection for latent MoE) + hidden_states, shared_experts_input = self.apply_routed_input_transform( + hidden_states + ) + + hidden_states, og_hidden_dims = self._maybe_pad_hidden_states( + shared_experts_input, + hidden_states, + ) + + fused_output = self.forward_entry( + hidden_states, + router_logits, + shared_experts_input, + self._encode_layer_name(), + ) + + return self._maybe_reduce_output(fused_output, og_hidden_dims) + + def forward_dispatch( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + # TODO(bnell): this can be removed after MK migration is complete. + layer.ensure_moe_quant_config_init() + + router_logits = self._maybe_gate(hidden_states, router_logits) + + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.EXTERNAL, + ) + + with self._sequence_parallel_context(): + return self._forward_impl( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) + + @abstractmethod + def _forward_impl( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py new file mode 100644 index 000000000000..4bda1ca89304 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, +) +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + 
FusedMoEMethodBase, +) +from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( + FusedMoERouter, +) +from vllm.model_executor.layers.fused_moe.runner.chunking_moe_runner import ( + ChunkingMoERunner, +) +from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import ( + DefaultMoERunner, +) +from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) + + +def create_moe_runner( + layer: torch.nn.Module, + moe_config: FusedMoEConfig, + router: FusedMoERouter, + routed_input_transform: torch.nn.Module | None, + gate: torch.nn.Module | None, + shared_experts: SharedExperts | None, + quant_method: FusedMoEMethodBase, + reduce_results: bool, + enable_dbo: bool, +) -> MoERunner: + runner = DefaultMoERunner( + layer, + moe_config, + router, + routed_input_transform, + gate, + shared_experts, + quant_method, + reduce_results, + enable_dbo, + ) + if moe_config.moe_parallel_config.use_dp_chunking: + return ChunkingMoERunner(runner) + return runner From bdefdf5216d6dccce8341e8c4498d57d0c3800a0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 03:07:43 +0000 Subject: [PATCH 035/191] fix merge Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/default_moe_runner.py | 3 --- .../layers/fused_moe/runner/moe_runner_base.py | 7 +++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 07d63cab2220..7b583bda14cc 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -76,9 +76,6 @@ def __init__( enable_dbo, ) - def is_internal_router(self) -> bool: - return self.gate is not None - @property def reduce_results(self) -> bool: return self._reduce_results diff --git 
a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 2a6e5713827b..c62eadbf263c 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -212,6 +212,9 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: else torch.ops.vllm.moe_forward_shared ) + def is_internal_router(self) -> bool: + return self.gate is not None + @property @abstractmethod def reduce_results(self) -> bool: @@ -358,7 +361,7 @@ def _apply_quant_method( # Run this before quant_method to avoid inplace issues. self._maybe_apply_shared_experts( shared_experts_input, - SharedExpertsOrder.BEFORE_QUANT_METHOD, + SharedExpertsOrder.NO_OVERLAP, ) if self.quant_method.is_monolithic: @@ -383,7 +386,7 @@ def _apply_quant_method( self._maybe_apply_shared_experts( shared_experts_input, - SharedExpertsOrder.AFTER_QUANT_METHOD, + SharedExpertsOrder.MULTI_STREAM_OVERLAPPED, ) return ( From 377acc859eeb00b6487b5acd906575f0e2019ff7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 03:24:35 +0000 Subject: [PATCH 036/191] fix merge Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/moe_runner_base.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index c62eadbf263c..078085d23f2e 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -402,17 +402,22 @@ def _sequence_parallel_context(self): else nullcontext() ) - def _maybe_gate( + def _maybe_overlap_gate_with_shared_experts( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: # If router/gate provided, then apply it here. 
# (Note: This code runs only when "overlapped mode" is on to allow # parallel execution of shared experts with the FusedMoE via # separate cuda stream) + if self.shared_experts is not None: + self.shared_experts.maybe_setup_shared_experts_stream(shared_experts_input) + if self.gate is not None: router_logits, _ = self.gate(hidden_states) + return router_logits def forward( @@ -478,7 +483,11 @@ def forward_dispatch( # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() - router_logits = self._maybe_gate(hidden_states, router_logits) + router_logits = self._maybe_overlap_gate_with_shared_experts( + hidden_states, + router_logits, + shared_experts_input, + ) self._maybe_apply_shared_experts( shared_experts_input, From 4fe1531a5ffc67555e239bccb8008d493c7cbdce Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Feb 2026 18:00:45 -0500 Subject: [PATCH 037/191] attempt to fix zero experts Signed-off-by: Bill Nell --- .../moe-refactor/LongCat-Flash-Chat-FP8.yaml | 10 ++ .../configs/moe-refactor/config-h100.txt | 1 + vllm/model_executor/layers/fused_moe/layer.py | 43 +++--- .../layers/fused_moe/zero_expert_fused_moe.py | 128 ++++++++++-------- vllm/model_executor/models/longcat_flash.py | 2 +- 5 files changed, 105 insertions(+), 79 deletions(-) create mode 100644 tests/evals/gsm8k/configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml diff --git a/tests/evals/gsm8k/configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml b/tests/evals/gsm8k/configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml new file mode 100644 index 000000000000..ca5c9a00ed37 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml @@ -0,0 +1,10 @@ +model_name: "meituan-longcat/LongCat-Flash-Chat-FP8" +accuracy_threshold: 0.70 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --tensor-parallel-size 8 + --enable-expert-parallel diff --git 
a/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt index 7397fc4e4626..912ba878fae6 100644 --- a/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt +++ b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt @@ -10,3 +10,4 @@ Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml Qwen3-30B-A3B-BF16-fi-cutlass.yaml Qwen3-30B-A3B-BF16-triton.yaml +LongCat-Flash-Chat-FP8.yaml diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5a83d5138168..0197c627af64 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -36,6 +36,9 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) +from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( + FusedMoERouter, +) from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) @@ -317,6 +320,7 @@ def __init__( gate: torch.nn.Module | None = None, shared_experts: torch.nn.Module | None = None, routed_input_transform: torch.nn.Module | None = None, + router: FusedMoERouter | None = None, ): super().__init__() @@ -488,24 +492,27 @@ def __init__( # TODO(bnell): we should not have to create a router if the kernel is # monolithic. 
- self.router = create_fused_moe_router( - top_k=top_k, - global_num_experts=self.global_num_experts, - eplb_state=self.eplb_state, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - num_fused_shared_experts=self.num_fused_shared_experts, - enable_eplb=enable_eplb, - # TODO(bnell): once we can construct the MK at init time, we - # can make this a value. - indices_type_getter=lambda: self.quant_method.topk_indices_dtype, - ) + if router is not None: + self.router = router + else: + self.router = create_fused_moe_router( + top_k=top_k, + global_num_experts=self.global_num_experts, + eplb_state=self.eplb_state, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + num_fused_shared_experts=self.num_fused_shared_experts, + enable_eplb=enable_eplb, + # TODO(bnell): once we can construct the MK at init time, we + # can make this a value. + indices_type_getter=lambda: self.quant_method.topk_indices_dtype, + ) self.routing_method_type: RoutingMethodType = self.router.routing_method_type # Round up hidden size before creating moe_config. 
diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py index 97d21767f4fc..eaadf0ad6eed 100644 --- a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from contextlib import contextmanager - import torch -from torch import nn from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import ( + fused_topk_bias, +) class ZeroExpertFusedMoE(FusedMoE): @@ -25,7 +25,7 @@ def __init__( self, zero_expert_num: int, zero_expert_type: str, - router: nn.Module, + e_score_correction_bias: torch.Tensor | None = None, **kwargs, ): # ZeroExpertFusedMoE manages its own custom_routing_function for memoization @@ -37,19 +37,35 @@ def __init__( "It manages its own for routing memoization." ) - # Automatically slice router's e_score_correction_bias to only include - # real experts (not zero_experts) for the base FusedMoE. - # The full bias will be used temporarily in forward() for routing. - if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs: + # Slice e_score_correction_bias to only include real experts + # (not zero experts) for the base FusedMoE. The full bias will be + # used temporarily in forward() for routing. + if e_score_correction_bias is not None and "num_experts" in kwargs: num_real_experts = kwargs["num_experts"] - router_bias = router.e_score_correction_bias user_bias = kwargs.get("e_score_correction_bias") - # Use router's bias if: - # 1. User didn't provide bias, or - # 2. 
User provided full bias (same size as router) - if user_bias is None or user_bias.shape[0] == router_bias.shape[0]: - kwargs["e_score_correction_bias"] = router_bias[:num_real_experts] + if ( + user_bias is None + or user_bias.shape[0] == e_score_correction_bias.shape[0] + ): + kwargs["e_score_correction_bias"] = e_score_correction_bias[ + :num_real_experts + ] + + # Create memoizing routing function BEFORE super().__init__() so it + # gets passed to create_fused_moe_router, which will create a + # CustomRoutingRouter that uses it. The closure captures `self` and + # accesses memoization state at call time (not definition time). + def custom_routing_function(hidden_states, gating_output, topk, renormalize): + """Return memoized `topk_weights` and `topk_ids`.""" + if self._memoized_topk_weights is None or self._memoized_topk_ids is None: + raise RuntimeError( + "ZeroExpertFusedMoE: routing results not memoized. " + "Call select_experts first to compute routing." + ) + return self._memoized_topk_weights, self._memoized_topk_ids + + kwargs["custom_routing_function"] = custom_routing_function # FusedMoE no longer accepts zero_expert_num/zero_expert_type. # We handle zero experts ourselves in forward(). 
@@ -57,7 +73,8 @@ def __init__( # Store the actual zero_expert_num and zero_expert_type for our own use self._actual_zero_expert_num = zero_expert_num self._actual_zero_expert_type = zero_expert_type - self._router = router # Full router (includes zero experts) + # Full e_score_correction_bias (includes zero experts) + self._full_e_score_correction_bias = e_score_correction_bias # Expose zero_expert_num and zero_expert_type as attributes for # compatibility with quantization methods that check these attributes @@ -68,38 +85,36 @@ def __init__( self._memoized_topk_weights: torch.Tensor | None = None self._memoized_topk_ids: torch.Tensor | None = None - # Create custom_routing_function to reuse memoized routing results - def custom_routing_function(hidden_states, gating_output, topk, renormalize): - """Return memoized `topk_weights` and `topk_ids`.""" - if self._memoized_topk_weights is None or self._memoized_topk_ids is None: - raise RuntimeError( - "ZeroExpertFusedMoE: routing results not memoized. " - "Call select_experts first to compute routing." - ) - return self._memoized_topk_weights, self._memoized_topk_ids - - self.custom_routing_function = custom_routing_function + def select_experts( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Route with full e_score_correction_bias (including zero experts). - @contextmanager - def _temporarily_set_attrs(self, **attrs): - """ - Temporarily set attributes using object.__setattr__ and restore them. - - This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues. - When PyTorch Dynamo traces the forward pass, it cannot handle - nn.Module.__setattr__ calls (which include parameter registration logic), - resulting in "Unsupported" errors. Using object.__setattr__ directly - sets the attribute without triggering nn.Module's custom __setattr__, - allowing Dynamo to trace the code successfully. 
+ This bypasses self.router (which is a memoizing CustomRoutingRouter) + to perform the actual routing computation with the full bias. """ - originals = {key: getattr(self, key) for key in attrs} - try: - for key, value in attrs.items(): - object.__setattr__(self, key, value) - yield - finally: - for key, value in originals.items(): - object.__setattr__(self, key, value) + if self._full_e_score_correction_bias is not None: + topk_weights, topk_ids = fused_topk_bias( + hidden_states=hidden_states, + gating_output=router_logits, + e_score_correction_bias=self._full_e_score_correction_bias.data, + topk=self.top_k, + renormalize=self.renormalize, + scoring_func=self.scoring_func, + ) + else: + from vllm.model_executor.layers.fused_moe import fused_topk + + topk_weights, topk_ids = fused_topk( + hidden_states, router_logits, self.top_k, self.renormalize + ) + + if self.routed_scaling_factor != 1.0: + topk_weights = topk_weights * self.routed_scaling_factor + + return topk_weights, topk_ids def _compute_zero_expert_result( self, @@ -138,21 +153,14 @@ def forward( Returns: Combined output from real experts and zero experts """ - # Prepare temporary attribute overrides for routing computation - temp_attrs = { - "custom_routing_function": None, # Disable for first routing - } - if self._router is not None: - temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias - - # Compute routing with temporary attributes - # Pass full router_logits (including zero experts) so that zero experts - # can be properly identified in topk_ids - with self._temporarily_set_attrs(**temp_attrs): - topk_weights, topk_ids = self.select_experts( - hidden_states=hidden_states, - router_logits=router_logits, # Full logits (includes zero experts) - ) + # Compute routing with full logits (including zero experts) so that + # zero experts can be properly identified in topk_ids. 
+ # This bypasses self.router (the memoizing CustomRoutingRouter) and + # performs routing directly with the full e_score_correction_bias. + topk_weights, topk_ids = self.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + ) # Compute zero expert result if needed zero_expert_result = self._compute_zero_expert_result( diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index a9e2c2268ee1..b695e062d276 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -297,7 +297,7 @@ def __init__( self.experts = ZeroExpertFusedMoE( zero_expert_num=config.zero_expert_num, zero_expert_type=config.zero_expert_type, - router=self.router, + e_score_correction_bias=self.router.e_score_correction_bias, num_experts=num_experts, top_k=top_k, hidden_size=hidden_size, From b6ba9200a4c94d73cd9e6930b2bf6edb1fd442b6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Feb 2026 19:06:00 -0500 Subject: [PATCH 038/191] simplify ZeroExpertFusedMoE and add ZeroExpertRouter Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 114 ++++++++++ .../fused_moe/router/zero_expert_router.py | 115 +++++++++++ .../layers/fused_moe/zero_expert_fused_moe.py | 194 ++++++------------ 3 files changed, 287 insertions(+), 136 deletions(-) create mode 100644 tests/kernels/moe/test_zero_expert_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/router/zero_expert_router.py diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py new file mode 100644 index 000000000000..40ac7d90741d --- /dev/null +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for ZeroExpertFusedMoE. + +Verifies that: +- The ZeroExpertRouter is properly created and used as the layer router. 
+- A forward pass through ZeroExpertFusedMoE produces correct output. +""" + +import pytest +import torch + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.forward_context import get_forward_context, set_forward_context +from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( + ZeroExpertRouter, +) +from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import ( + ZeroExpertFusedMoE, +) +from vllm.v1.worker.workspace import init_workspace_manager + + +@pytest.fixture +def zero_expert_moe(dist_init, default_vllm_config): + """Create a ZeroExpertFusedMoE layer with zero experts.""" + num_experts = 4 + top_k = 2 + hidden_size = 128 + intermediate_size = 256 + zero_expert_num = 1 + + e_score_correction_bias = torch.zeros( + num_experts + zero_expert_num, + dtype=torch.float32, + device="cuda", + ) + + vllm_config = VllmConfig() + vllm_config.compilation_config.static_forward_context = dict() + + with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): + init_workspace_manager(torch.cuda.current_device()) + + layer = ZeroExpertFusedMoE( + zero_expert_num=zero_expert_num, + zero_expert_type="identity", + e_score_correction_bias=e_score_correction_bias, + num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + params_dtype=torch.bfloat16, + prefix="test_zero_expert_moe", + renormalize=False, + routed_scaling_factor=1.0, + scoring_func="softmax", + ).cuda() + + layer.quant_method.process_weights_after_loading(layer) + + yield layer, vllm_config + + +@pytest.mark.parametrize("num_tokens", [1, 32]) +def test_zero_expert_moe_router_is_zero_expert_router(zero_expert_moe, num_tokens): + """Verify that ZeroExpertFusedMoE creates a ZeroExpertRouter.""" + layer, _ = zero_expert_moe + assert isinstance(layer.router, ZeroExpertRouter), ( + f"Expected ZeroExpertRouter but got {type(layer.router).__name__}." 
+ ) + + +@pytest.mark.parametrize("num_tokens", [1, 32]) +def test_zero_expert_moe_no_custom_routing_fn(zero_expert_moe, num_tokens): + """Verify that custom_routing_function is not set (routing is handled + by ZeroExpertRouter, not a memoizing closure).""" + layer, _ = zero_expert_moe + assert layer.custom_routing_function is None + + +@pytest.mark.parametrize("num_tokens", [1, 32]) +def test_zero_expert_moe_forward(zero_expert_moe, num_tokens): + """Run a forward pass through ZeroExpertFusedMoE and verify output shape.""" + layer, vllm_config = zero_expert_moe + + hidden_size = layer.hidden_size + num_experts = 4 + zero_expert_num = 1 + total_experts = num_experts + zero_expert_num + + hidden_states = torch.randn( + num_tokens, hidden_size, dtype=torch.bfloat16, device="cuda" + ) + router_logits = torch.randn( + num_tokens, total_experts, dtype=torch.float32, device="cuda" + ) + + # Initialize weights to small random values to avoid NaN from + # uninitialized memory. + with torch.no_grad(): + for param in layer.parameters(): + if param.dtype.is_floating_point: + param.normal_(0, 0.01) + + with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): + get_forward_context().all_moe_layers = None + output = layer.forward(hidden_states, router_logits) + + assert output.shape == hidden_states.shape, ( + f"Expected output shape {hidden_states.shape}, got {output.shape}" + ) + assert output.dtype == hidden_states.dtype + assert not torch.isnan(output).any(), "Output contains NaN values" diff --git a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py new file mode 100644 index 000000000000..c87070bc5acf --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable + +import torch 
+ +from vllm.distributed.eplb.eplb_state import EplbLayerState +from vllm.model_executor.layers.fused_moe.config import ( + RoutingMethodType, + get_routing_method_type, +) +from vllm.model_executor.layers.fused_moe.fused_moe import ( + zero_experts_compute_triton, +) +from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter +from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import ( + fused_topk_bias, +) + + +class ZeroExpertRouter(BaseRouter): + """Router that handles zero expert computation as part of routing. + + Routes over all experts (real + zero) using full e_score_correction_bias. + Computes zero expert identity contributions as a side effect during routing. + Remaps zero expert IDs to real expert ID 0 (with weight 0) so downstream + MoE computation can ignore them. + """ + + def __init__( + self, + top_k: int, + global_num_experts: int, + eplb_state: EplbLayerState, + e_score_correction_bias: torch.Tensor, + num_logical_experts: int, + zero_expert_type: str, + scoring_func: str = "softmax", + renormalize: bool = False, + routed_scaling_factor: float = 1.0, + enable_eplb: bool = False, + indices_type_getter: Callable[[], torch.dtype | None] | None = None, + ): + super().__init__( + top_k=top_k, + global_num_experts=global_num_experts, + eplb_state=eplb_state, + enable_eplb=enable_eplb, + indices_type_getter=indices_type_getter, + ) + self.e_score_correction_bias = e_score_correction_bias + self.num_logical_experts = num_logical_experts + self.zero_expert_type = zero_expert_type + self.scoring_func = scoring_func + self.renormalize = renormalize + self.routed_scaling_factor = routed_scaling_factor + self._zero_expert_output: torch.Tensor | None = None + + @property + def routing_method_type(self) -> RoutingMethodType: + return get_routing_method_type( + scoring_func=self.scoring_func, + top_k=self.top_k, + renormalize=self.renormalize, + num_expert_group=None, + has_e_score_bias=True, + ) + + def _compute_routing( 
+ self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + indices_type: torch.dtype | None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Compute routing with full bias, compute zero expert output, + mask zero expert IDs.""" + topk_weights, topk_ids = fused_topk_bias( + hidden_states=hidden_states, + gating_output=router_logits, + e_score_correction_bias=self.e_score_correction_bias.data, + topk=self.top_k, + renormalize=self.renormalize, + scoring_func=self.scoring_func, + indices_type=indices_type, + ) + + if self.routed_scaling_factor != 1.0: + topk_weights *= self.routed_scaling_factor + + # Compute zero expert output using pre-EPLB topk_ids/weights. + # zero_experts_compute_triton modifies its inputs in-place, so + # pass clones. + self._zero_expert_output = zero_experts_compute_triton( + expert_indices=topk_ids.clone(), + expert_scales=topk_weights.clone(), + num_experts=self.num_logical_experts, + zero_expert_type=self.zero_expert_type, + hidden_states=hidden_states, + ) + + # Mask zero expert entries: remap zero expert IDs to 0 with weight 0 + # so downstream MoE computation ignores them. 
+ zero_mask = topk_ids >= self.num_logical_experts + topk_ids[zero_mask] = 0 + topk_weights[zero_mask] = 0.0 + + return topk_weights, topk_ids + + @property + def zero_expert_output(self) -> torch.Tensor | None: + """Retrieve and clear the zero expert output.""" + output = self._zero_expert_output + self._zero_expert_output = None + return output diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py index eaadf0ad6eed..5530bce9d26e 100644 --- a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py @@ -3,10 +3,9 @@ import torch -from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton from vllm.model_executor.layers.fused_moe.layer import FusedMoE -from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import ( - fused_topk_bias, +from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( + ZeroExpertRouter, ) @@ -16,9 +15,10 @@ class ZeroExpertFusedMoE(FusedMoE): Zero experts perform identity operations (scaled pass-through) instead of full MLP computations. - This class uses memoization to avoid redundant routing computation: - routing is computed once and reused for both zero expert computation - and the main FusedMoE forward pass. + Uses a ZeroExpertRouter as the layer's main router. The router handles + routing over all experts (real + zero) with the full e_score_correction_bias, + computes zero expert contributions as a side effect, and remaps zero expert + IDs so downstream MoE computation only processes real experts. 
""" def __init__( @@ -28,22 +28,27 @@ def __init__( e_score_correction_bias: torch.Tensor | None = None, **kwargs, ): - # ZeroExpertFusedMoE manages its own custom_routing_function for memoization assert ( "custom_routing_function" not in kwargs or kwargs.get("custom_routing_function") is None ), ( "ZeroExpertFusedMoE does not support external custom_routing_function. " - "It manages its own for routing memoization." + "Routing is handled by ZeroExpertRouter." ) + assert "router" not in kwargs or kwargs.get("router") is None, ( + "ZeroExpertFusedMoE creates its own ZeroExpertRouter. Do not pass a router." + ) + + # Remove custom_routing_function from kwargs if present + kwargs.pop("custom_routing_function", None) + kwargs.pop("router", None) + # Slice e_score_correction_bias to only include real experts - # (not zero experts) for the base FusedMoE. The full bias will be - # used temporarily in forward() for routing. - if e_score_correction_bias is not None and "num_experts" in kwargs: - num_real_experts = kwargs["num_experts"] + # for the base FusedMoE router factory (which we'll replace anyway). + num_real_experts = kwargs["num_experts"] + if e_score_correction_bias is not None: user_bias = kwargs.get("e_score_correction_bias") - if ( user_bias is None or user_bias.shape[0] == e_score_correction_bias.shape[0] @@ -52,99 +57,48 @@ def __init__( :num_real_experts ] - # Create memoizing routing function BEFORE super().__init__() so it - # gets passed to create_fused_moe_router, which will create a - # CustomRoutingRouter that uses it. The closure captures `self` and - # accesses memoization state at call time (not definition time). - def custom_routing_function(hidden_states, gating_output, topk, renormalize): - """Return memoized `topk_weights` and `topk_ids`.""" - if self._memoized_topk_weights is None or self._memoized_topk_ids is None: - raise RuntimeError( - "ZeroExpertFusedMoE: routing results not memoized. " - "Call select_experts first to compute routing." 
- ) - return self._memoized_topk_weights, self._memoized_topk_ids - - kwargs["custom_routing_function"] = custom_routing_function - - # FusedMoE no longer accepts zero_expert_num/zero_expert_type. - # We handle zero experts ourselves in forward(). super().__init__(**kwargs) - # Store the actual zero_expert_num and zero_expert_type for our own use - self._actual_zero_expert_num = zero_expert_num - self._actual_zero_expert_type = zero_expert_type - # Full e_score_correction_bias (includes zero experts) - self._full_e_score_correction_bias = e_score_correction_bias - - # Expose zero_expert_num and zero_expert_type as attributes for - # compatibility with quantization methods that check these attributes - self.zero_expert_num = 0 - self.zero_expert_type = None - # Memoization state for routing results - self._memoized_topk_weights: torch.Tensor | None = None - self._memoized_topk_ids: torch.Tensor | None = None + # Replace the factory-created router with our ZeroExpertRouter. + # Uses self.eplb_state created by super().__init__() so EPLB state + # is shared between the layer and the router. + self.router = ZeroExpertRouter( + top_k=self.top_k, + global_num_experts=self.global_num_experts, + eplb_state=self.eplb_state, + e_score_correction_bias=e_score_correction_bias, + num_logical_experts=self.logical_num_experts, + zero_expert_type=zero_expert_type, + scoring_func=self.scoring_func, + renormalize=self.renormalize, + routed_scaling_factor=self.routed_scaling_factor, + enable_eplb=self.enable_eplb, + indices_type_getter=lambda: self.quant_method.topk_indices_dtype, + ) - def select_experts( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - """Route with full e_score_correction_bias (including zero experts). 
+ # Update routing_method_type to match the new router + self.routing_method_type = self.router.routing_method_type - This bypasses self.router (which is a memoizing CustomRoutingRouter) - to perform the actual routing computation with the full bias. - """ - if self._full_e_score_correction_bias is not None: - topk_weights, topk_ids = fused_topk_bias( - hidden_states=hidden_states, - gating_output=router_logits, - e_score_correction_bias=self._full_e_score_correction_bias.data, - topk=self.top_k, - renormalize=self.renormalize, - scoring_func=self.scoring_func, - ) - else: - from vllm.model_executor.layers.fused_moe import fused_topk - - topk_weights, topk_ids = fused_topk( - hidden_states, router_logits, self.top_k, self.renormalize - ) - - if self.routed_scaling_factor != 1.0: - topk_weights = topk_weights * self.routed_scaling_factor - - return topk_weights, topk_ids - - def _compute_zero_expert_result( - self, - hidden_states: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - ) -> torch.Tensor | None: - """Compute zero expert results using pre-computed routing.""" - if ( - self._actual_zero_expert_num is None - or self._actual_zero_expert_num <= 0 - or self._actual_zero_expert_type is None - ): - return None - - return zero_experts_compute_triton( - expert_indices=topk_ids.clone(), - expert_scales=topk_weights.clone(), - num_experts=self.logical_num_experts, - zero_expert_type=self._actual_zero_expert_type, - hidden_states=hidden_states, - ) + # Re-init runner with the new router + self.runner = self._init_runner() + + # Expose zero_expert_num=0 and zero_expert_type=None for + # compatibility with quantization methods that check these attributes. + # The actual zero expert handling is done by ZeroExpertRouter. 
+ self.zero_expert_num = 0 + self.zero_expert_type = None def forward( self, hidden_states: torch.Tensor, - router_logits: torch.Tensor, # Full logits including zero experts + router_logits: torch.Tensor, ) -> torch.Tensor: """ - Forward pass with zero expert support and routing memoization. + Forward pass with zero expert support. + + The ZeroExpertRouter handles routing with full logits (including zero + experts), computes zero expert contributions internally, and returns + masked topk_ids suitable for real expert MoE computation. Args: hidden_states: Input hidden states @@ -153,45 +107,13 @@ def forward( Returns: Combined output from real experts and zero experts """ - # Compute routing with full logits (including zero experts) so that - # zero experts can be properly identified in topk_ids. - # This bypasses self.router (the memoizing CustomRoutingRouter) and - # performs routing directly with the full e_score_correction_bias. - topk_weights, topk_ids = self.select_experts( - hidden_states=hidden_states, - router_logits=router_logits, - ) - - # Compute zero expert result if needed - zero_expert_result = self._compute_zero_expert_result( - hidden_states=hidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - ) - - # Memoize routing results for reuse in super().forward() - self._memoized_topk_weights = topk_weights - self._memoized_topk_ids = topk_ids - - # Slice router_logits for real experts only - router_logits_sliced = router_logits[..., : self.logical_num_experts] - - # Compute real expert results (will reuse memoized routing via - # custom_routing_function) - # zero_expert_num is already 0, so FusedMoE won't handle zero experts - fused_out = super().forward( - hidden_states=hidden_states, - router_logits=router_logits_sliced, - ) - - # Combine results - # Both zero_expert_result and fused_out are computed from the same - # hidden_states, so they should be on the same device. 
- if zero_expert_result is not None: - fused_out = fused_out + zero_expert_result - - # Clear memoization after use - self._memoized_topk_weights = None - self._memoized_topk_ids = None + # The router handles full logits internally: routes over all experts + # (real + zero), computes zero expert output, masks zero expert IDs. + fused_out = super().forward(hidden_states, router_logits) + + # Retrieve zero expert output computed during routing + zero_expert_output = self.router.zero_expert_output + if zero_expert_output is not None: + fused_out = fused_out + zero_expert_output return fused_out From 3dd1f27a01fadcb3a0dfd070d88e51caa06f649e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Feb 2026 19:17:52 -0500 Subject: [PATCH 039/191] add value test Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 143 +++++++++++++++++++++- 1 file changed, 141 insertions(+), 2 deletions(-) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index 40ac7d90741d..7713e414ea3a 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -5,6 +5,7 @@ Verifies that: - The ZeroExpertRouter is properly created and used as the layer router. - A forward pass through ZeroExpertFusedMoE produces correct output. +- The output decomposes correctly into real expert + zero expert contributions. 
""" import pytest @@ -12,6 +13,7 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.forward_context import get_forward_context, set_forward_context +from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( ZeroExpertRouter, ) @@ -26,8 +28,10 @@ def zero_expert_moe(dist_init, default_vllm_config): """Create a ZeroExpertFusedMoE layer with zero experts.""" num_experts = 4 top_k = 2 - hidden_size = 128 - intermediate_size = 256 + # hidden_size must be >= 256 for the zero expert identity kernel to + # produce output (its BLOCK_SIZE=256 causes grid=0 when hidden_dim<256). + hidden_size = 256 + intermediate_size = 512 zero_expert_num = 1 e_score_correction_bias = torch.zeros( @@ -112,3 +116,138 @@ def test_zero_expert_moe_forward(zero_expert_moe, num_tokens): ) assert output.dtype == hidden_states.dtype assert not torch.isnan(output).any(), "Output contains NaN values" + + +@pytest.mark.parametrize("num_tokens", [1, 32]) +def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): + """Validate that ZeroExpertFusedMoE output equals real expert output + plus zero expert contribution. + + The key invariant is: + layer.forward(h, r) == FusedMoE.forward(h, r) + zero_expert_output + + FusedMoE.forward() computes only the real expert MoE output (the + ZeroExpertRouter masks zero expert entries to weight=0), while the + zero expert contribution is computed as a side effect during routing + and added on top by ZeroExpertFusedMoE.forward(). 
+ """ + layer, vllm_config = zero_expert_moe + num_experts = 4 + zero_expert_num = 1 + total_experts = num_experts + zero_expert_num + + hidden_states = torch.randn( + num_tokens, layer.hidden_size, dtype=torch.bfloat16, device="cuda" + ) + router_logits = torch.randn( + num_tokens, total_experts, dtype=torch.float32, device="cuda" + ) + + with torch.no_grad(): + for param in layer.parameters(): + if param.dtype.is_floating_point: + param.normal_(0, 0.01) + + with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): + get_forward_context().all_moe_layers = None + + # Get the real expert output only (bypasses ZeroExpertFusedMoE.forward, + # calls FusedMoE.forward directly). The ZeroExpertRouter still runs and + # stores zero_expert_output as a side effect. + real_output = FusedMoE.forward(layer, hidden_states, router_logits) + zero_output = layer.router.zero_expert_output + + # Get the full combined output. + full_output = layer.forward(hidden_states, router_logits) + + assert zero_output is not None, "Zero expert output should not be None" + assert not torch.isnan(real_output).any(), "Real expert output has NaN" + assert not torch.isnan(zero_output).any(), "Zero expert output has NaN" + assert not torch.isnan(full_output).any(), "Full output has NaN" + + expected = real_output + zero_output + torch.testing.assert_close( + full_output, + expected, + atol=0, + rtol=0, + msg="ZeroExpertFusedMoE output should equal real expert output " + "plus zero expert contribution", + ) + + +@pytest.mark.parametrize("num_tokens", [1, 32]) +def test_zero_expert_moe_zero_expert_is_identity(zero_expert_moe, num_tokens): + """Validate zero expert identity behavior. + + When routing strongly favors the zero expert, its contribution should + be a scaled version of hidden_states (identity operation). We verify + this by manually computing the expected zero expert output from the + routing weights and comparing against what the router produces. 
+ """ + layer, vllm_config = zero_expert_moe + num_experts = 4 + zero_expert_num = 1 + total_experts = num_experts + zero_expert_num + + hidden_states = torch.randn( + num_tokens, layer.hidden_size, dtype=torch.bfloat16, device="cuda" + ) + # Strongly bias toward the zero expert (index 4). + router_logits = torch.full( + (num_tokens, total_experts), -10.0, dtype=torch.float32, device="cuda" + ) + router_logits[:, num_experts] = 10.0 # zero expert gets high logit + + with torch.no_grad(): + for param in layer.parameters(): + if param.dtype.is_floating_point: + param.normal_(0, 0.01) + + with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): + get_forward_context().all_moe_layers = None + + # Run routing to get topk_weights/topk_ids before masking. + from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import ( + fused_topk_bias, + ) + + topk_weights, topk_ids = fused_topk_bias( + hidden_states=hidden_states, + gating_output=router_logits, + e_score_correction_bias=layer.router.e_score_correction_bias.data, + topk=layer.top_k, + renormalize=layer.router.renormalize, + scoring_func=layer.router.scoring_func, + ) + + # Manually compute expected zero expert identity output: + # For each token, sum routing weights assigned to zero expert slots, + # then multiply by hidden_states. + zero_mask = topk_ids >= num_experts + zero_weight_per_token = (topk_weights * zero_mask.float()).sum( + dim=-1, keepdim=True + ) + expected_zero_output = (hidden_states.float() * zero_weight_per_token).to( + hidden_states.dtype + ) + + # Run the layer forward to trigger routing and get the actual + # zero expert output from the router. 
+ FusedMoE.forward(layer, hidden_states, router_logits) + actual_zero_output = layer.router.zero_expert_output + + assert actual_zero_output is not None + assert zero_mask.any(), ( + "With high zero expert logit, at least some slots should route " + "to the zero expert" + ) + + torch.testing.assert_close( + actual_zero_output, + expected_zero_output, + atol=1e-3, + rtol=1e-3, + msg="Zero expert identity output should equal " + "hidden_states * sum(zero_expert_weights)", + ) From 20832b59c6a63d980b58b2a01b31aa3222085a0a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 27 Feb 2026 15:04:12 -0500 Subject: [PATCH 040/191] move ZeroExpertRouter construction into router factory Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 3 ++ .../layers/fused_moe/router/router_factory.py | 42 ++++++++++++++++-- .../layers/fused_moe/zero_expert_fused_moe.py | 44 ++++--------------- 3 files changed, 50 insertions(+), 39 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 0197c627af64..d07e368bd2f2 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -321,6 +321,7 @@ def __init__( shared_experts: torch.nn.Module | None = None, routed_input_transform: torch.nn.Module | None = None, router: FusedMoERouter | None = None, + zero_expert_type: str | None = None, ): super().__init__() @@ -512,6 +513,8 @@ def __init__( # TODO(bnell): once we can construct the MK at init time, we # can make this a value. 
indices_type_getter=lambda: self.quant_method.topk_indices_dtype, + zero_expert_type=zero_expert_type, + num_logical_experts=self.logical_num_experts, ) self.routing_method_type: RoutingMethodType = self.router.routing_method_type diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py index 11027e894bee..42d418d7e537 100644 --- a/vllm/model_executor/layers/fused_moe/router/router_factory.py +++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py @@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.router.routing_simulator_router import ( RoutingSimulatorRouter, ) +from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( + ZeroExpertRouter, +) EMPTY_EPLB_STATE: EplbLayerState = EplbLayerState() @@ -49,6 +52,9 @@ def create_fused_moe_router( # eplb parameters enable_eplb: bool = False, eplb_state: EplbLayerState = EMPTY_EPLB_STATE, + # zero expert parameters + zero_expert_type: str | None = None, + num_logical_experts: int | None = None, ) -> FusedMoERouter: """ Factory function to create the appropriate FusedMoERouter subclass based on @@ -56,10 +62,11 @@ def create_fused_moe_router( The selection logic follows this priority order: 1. RoutingSimulatorRouter - if VLLM_MOE_ROUTING_SIMULATION_STRATEGY env var is set - 2. GroupedTopKRouter - if use_grouped_topk is True - 3. CustomRoutingRouter - if custom_routing_function is not None - 4. FusedTopKBiasRouter - if e_score_correction_bias is not None - 5. FusedTopKRouter - default fallback + 2. ZeroExpertRouter - if zero_expert_type is not None + 3. GroupedTopKRouter - if use_grouped_topk is True + 4. CustomRoutingRouter - if custom_routing_function is not None + 5. FusedTopKBiasRouter - if e_score_correction_bias is not None + 6. 
FusedTopKRouter - default fallback Common arguments: top_k: Number of experts to select per token @@ -86,6 +93,12 @@ def create_fused_moe_router( enable_eplb: Whether EPLB is enabled eplb_state: EPLB (Expert Parallelism Load Balancing) state + Zero expert arguments: + zero_expert_type: Type of zero expert (e.g. identity). If not None, + creates a ZeroExpertRouter. + num_logical_experts: Number of real (non-zero) experts. Required when + zero_expert_type is not None. + Returns: An instance of the appropriate FusedMoERouter subclass """ @@ -100,6 +113,27 @@ def create_fused_moe_router( indices_type_getter=indices_type_getter, ) + if zero_expert_type is not None: + assert num_logical_experts is not None, ( + "num_logical_experts is required when zero_expert_type is set" + ) + assert e_score_correction_bias is not None, ( + "e_score_correction_bias is required when zero_expert_type is set" + ) + return ZeroExpertRouter( + top_k=top_k, + global_num_experts=global_num_experts, + eplb_state=eplb_state, + e_score_correction_bias=e_score_correction_bias, + num_logical_experts=num_logical_experts, + zero_expert_type=zero_expert_type, + scoring_func=scoring_func, + renormalize=renormalize, + routed_scaling_factor=routed_scaling_factor, + enable_eplb=enable_eplb, + indices_type_getter=indices_type_getter, + ) + if use_grouped_topk: assert custom_routing_function is None if num_expert_group is None or topk_group is None: diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py index 5530bce9d26e..4ebfa0d3637a 100644 --- a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py @@ -4,9 +4,6 @@ import torch from vllm.model_executor.layers.fused_moe.layer import FusedMoE -from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( - ZeroExpertRouter, -) class ZeroExpertFusedMoE(FusedMoE): @@ -44,43 +41,20 @@ 
def __init__( kwargs.pop("custom_routing_function", None) kwargs.pop("router", None) - # Slice e_score_correction_bias to only include real experts - # for the base FusedMoE router factory (which we'll replace anyway). + # Pass the full e_score_correction_bias (real + zero experts) and + # zero_expert_type through to the router factory, which will create + # a ZeroExpertRouter. num_real_experts = kwargs["num_experts"] if e_score_correction_bias is not None: - user_bias = kwargs.get("e_score_correction_bias") - if ( - user_bias is None - or user_bias.shape[0] == e_score_correction_bias.shape[0] - ): - kwargs["e_score_correction_bias"] = e_score_correction_bias[ - :num_real_experts - ] + kwargs["e_score_correction_bias"] = e_score_correction_bias + kwargs["zero_expert_type"] = zero_expert_type super().__init__(**kwargs) - # Replace the factory-created router with our ZeroExpertRouter. - # Uses self.eplb_state created by super().__init__() so EPLB state - # is shared between the layer and the router. - self.router = ZeroExpertRouter( - top_k=self.top_k, - global_num_experts=self.global_num_experts, - eplb_state=self.eplb_state, - e_score_correction_bias=e_score_correction_bias, - num_logical_experts=self.logical_num_experts, - zero_expert_type=zero_expert_type, - scoring_func=self.scoring_func, - renormalize=self.renormalize, - routed_scaling_factor=self.routed_scaling_factor, - enable_eplb=self.enable_eplb, - indices_type_getter=lambda: self.quant_method.topk_indices_dtype, - ) - - # Update routing_method_type to match the new router - self.routing_method_type = self.router.routing_method_type - - # Re-init runner with the new router - self.runner = self._init_runner() + # Fix self.e_score_correction_bias to only cover real experts, + # for compatibility with monolithic kernels that read it directly. 
+ if e_score_correction_bias is not None: + self.e_score_correction_bias = e_score_correction_bias[:num_real_experts] # Expose zero_expert_num=0 and zero_expert_type=None for # compatibility with quantization methods that check these attributes. From d5676bdb24ef72c660dbaf333a27a76154a4498f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 27 Feb 2026 15:34:13 -0500 Subject: [PATCH 041/191] move zero expert handling into MoERunnerBase Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 45 +++++-------------- .../fused_moe/runner/moe_runner_base.py | 20 ++++++++- .../layers/fused_moe/zero_expert_fused_moe.py | 30 ------------- 3 files changed, 31 insertions(+), 64 deletions(-) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index 7713e414ea3a..60675e50d56f 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -13,7 +13,6 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.forward_context import get_forward_context, set_forward_context -from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( ZeroExpertRouter, ) @@ -119,18 +118,9 @@ def test_zero_expert_moe_forward(zero_expert_moe, num_tokens): @pytest.mark.parametrize("num_tokens", [1, 32]) -def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): - """Validate that ZeroExpertFusedMoE output equals real expert output - plus zero expert contribution. - - The key invariant is: - layer.forward(h, r) == FusedMoE.forward(h, r) + zero_expert_output - - FusedMoE.forward() computes only the real expert MoE output (the - ZeroExpertRouter masks zero expert entries to weight=0), while the - zero expert contribution is computed as a side effect during routing - and added on top by ZeroExpertFusedMoE.forward(). 
- """ +def test_zero_expert_moe_output_deterministic(zero_expert_moe, num_tokens): + """Validate that two forward calls with the same input produce + identical output (determinism).""" layer, vllm_config = zero_expert_moe num_experts = 4 zero_expert_num = 1 @@ -151,28 +141,17 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): get_forward_context().all_moe_layers = None - # Get the real expert output only (bypasses ZeroExpertFusedMoE.forward, - # calls FusedMoE.forward directly). The ZeroExpertRouter still runs and - # stores zero_expert_output as a side effect. - real_output = FusedMoE.forward(layer, hidden_states, router_logits) - zero_output = layer.router.zero_expert_output - - # Get the full combined output. - full_output = layer.forward(hidden_states, router_logits) + output1 = layer.forward(hidden_states, router_logits) + output2 = layer.forward(hidden_states, router_logits) - assert zero_output is not None, "Zero expert output should not be None" - assert not torch.isnan(real_output).any(), "Real expert output has NaN" - assert not torch.isnan(zero_output).any(), "Zero expert output has NaN" - assert not torch.isnan(full_output).any(), "Full output has NaN" + assert not torch.isnan(output1).any(), "Output contains NaN" - expected = real_output + zero_output torch.testing.assert_close( - full_output, - expected, + output1, + output2, atol=0, rtol=0, - msg="ZeroExpertFusedMoE output should equal real expert output " - "plus zero expert contribution", + msg="Two forward calls with same input should produce identical output", ) @@ -232,9 +211,9 @@ def test_zero_expert_moe_zero_expert_is_identity(zero_expert_moe, num_tokens): hidden_states.dtype ) - # Run the layer forward to trigger routing and get the actual - # zero expert output from the router. 
- FusedMoE.forward(layer, hidden_states, router_logits) + # Run routing directly to trigger zero expert computation + # without going through the runner (which consumes the output). + layer.router.select_experts(hidden_states, router_logits) actual_zero_output = layer.router.zero_expert_output assert actual_zero_output is not None diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 078085d23f2e..8cd24c2e2316 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -26,6 +26,9 @@ from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, ) +from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( + ZeroExpertRouter, +) from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( SharedExperts, @@ -420,6 +423,19 @@ def _maybe_overlap_gate_with_shared_experts( return router_logits + def _maybe_add_zero_expert_output( + self, + result: torch.Tensor | tuple[torch.Tensor, torch.Tensor], + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if isinstance(self.router, ZeroExpertRouter): + zero_expert_output = self.router.zero_expert_output + assert zero_expert_output is not None + if isinstance(result, tuple): + result = (result[0], result[1] + zero_expert_output) + else: + result = result + zero_expert_output + return result + def forward( self, hidden_states: torch.Tensor, @@ -471,7 +487,9 @@ def forward( self._encode_layer_name(), ) - return self._maybe_reduce_output(fused_output, og_hidden_dims) + result = self._maybe_reduce_output(fused_output, og_hidden_dims) + + return self._maybe_add_zero_expert_output(result) def forward_dispatch( self, diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py 
b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py index 4ebfa0d3637a..4a6f2a5500c4 100644 --- a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py @@ -61,33 +61,3 @@ def __init__( # The actual zero expert handling is done by ZeroExpertRouter. self.zero_expert_num = 0 self.zero_expert_type = None - - def forward( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> torch.Tensor: - """ - Forward pass with zero expert support. - - The ZeroExpertRouter handles routing with full logits (including zero - experts), computes zero expert contributions internally, and returns - masked topk_ids suitable for real expert MoE computation. - - Args: - hidden_states: Input hidden states - router_logits: Full router logits (including zero experts) - - Returns: - Combined output from real experts and zero experts - """ - # The router handles full logits internally: routes over all experts - # (real + zero), computes zero expert output, masks zero expert IDs. 
- fused_out = super().forward(hidden_states, router_logits) - - # Retrieve zero expert output computed during routing - zero_expert_output = self.router.zero_expert_output - if zero_expert_output is not None: - fused_out = fused_out + zero_expert_output - - return fused_out From 165cfe6219e276c331b415153da0c9add3150aea Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 27 Feb 2026 15:43:55 -0500 Subject: [PATCH 042/191] slightly improved test Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 47 ++++++++++++++++++----- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index 60675e50d56f..1c34cfe32a28 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -118,9 +118,17 @@ def test_zero_expert_moe_forward(zero_expert_moe, num_tokens): @pytest.mark.parametrize("num_tokens", [1, 32]) -def test_zero_expert_moe_output_deterministic(zero_expert_moe, num_tokens): - """Validate that two forward calls with the same input produce - identical output (determinism).""" +def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): + """Validate that the layer output equals real expert output plus zero + expert contribution. + + The key invariant is: + layer.forward(h, r) == quant_method.apply(routing) + zero_expert_output + + We compute routing and zero expert output via router.select_experts(), + then compute real expert output via quant_method.apply() directly, and + verify that the layer forward produces their sum. 
+ """ layer, vllm_config = zero_expert_moe num_experts = 4 zero_expert_num = 1 @@ -141,17 +149,38 @@ def test_zero_expert_moe_output_deterministic(zero_expert_moe, num_tokens): with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): get_forward_context().all_moe_layers = None - output1 = layer.forward(hidden_states, router_logits) - output2 = layer.forward(hidden_states, router_logits) + # Compute routing and zero expert output directly (without the + # runner consuming zero_expert_output). + topk_weights, topk_ids = layer.router.select_experts( + hidden_states, router_logits + ) + zero_output = layer.router.zero_expert_output + + # Compute real expert output via quant_method.apply(). + real_output = layer.quant_method.apply( + layer=layer, + x=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts_input=None, + ) + + # Get the combined output from layer.forward(). + full_output = layer.forward(hidden_states, router_logits) - assert not torch.isnan(output1).any(), "Output contains NaN" + assert zero_output is not None, "Zero expert output should not be None" + assert not torch.isnan(real_output).any(), "Real expert output has NaN" + assert not torch.isnan(zero_output).any(), "Zero expert output has NaN" + assert not torch.isnan(full_output).any(), "Full output has NaN" + expected = real_output + zero_output torch.testing.assert_close( - output1, - output2, + full_output, + expected, atol=0, rtol=0, - msg="Two forward calls with same input should produce identical output", + msg="Layer output should equal real expert output " + "plus zero expert contribution", ) From a7ac4c4ee2326c8b0386656f66d828b07f1e05b3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 27 Feb 2026 16:00:34 -0500 Subject: [PATCH 043/191] simplifications Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 1 - vllm/model_executor/layers/fused_moe/layer.py | 47 ++++++++----------- 
.../layers/fused_moe/zero_expert_fused_moe.py | 7 --- vllm/model_executor/models/longcat_flash.py | 2 - 4 files changed, 20 insertions(+), 37 deletions(-) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index 1c34cfe32a28..811bd35a869c 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -46,7 +46,6 @@ def zero_expert_moe(dist_init, default_vllm_config): init_workspace_manager(torch.cuda.current_device()) layer = ZeroExpertFusedMoE( - zero_expert_num=zero_expert_num, zero_expert_type="identity", e_score_correction_bias=e_score_correction_bias, num_experts=num_experts, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d07e368bd2f2..d3b888bee126 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -36,9 +36,6 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) -from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( - FusedMoERouter, -) from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) @@ -320,7 +317,6 @@ def __init__( gate: torch.nn.Module | None = None, shared_experts: torch.nn.Module | None = None, routed_input_transform: torch.nn.Module | None = None, - router: FusedMoERouter | None = None, zero_expert_type: str | None = None, ): super().__init__() @@ -493,29 +489,26 @@ def __init__( # TODO(bnell): we should not have to create a router if the kernel is # monolithic. 
- if router is not None: - self.router = router - else: - self.router = create_fused_moe_router( - top_k=top_k, - global_num_experts=self.global_num_experts, - eplb_state=self.eplb_state, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - num_fused_shared_experts=self.num_fused_shared_experts, - enable_eplb=enable_eplb, - # TODO(bnell): once we can construct the MK at init time, we - # can make this a value. - indices_type_getter=lambda: self.quant_method.topk_indices_dtype, - zero_expert_type=zero_expert_type, - num_logical_experts=self.logical_num_experts, - ) + self.router = create_fused_moe_router( + top_k=top_k, + global_num_experts=self.global_num_experts, + eplb_state=self.eplb_state, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + num_fused_shared_experts=self.num_fused_shared_experts, + enable_eplb=enable_eplb, + # TODO(bnell): once we can construct the MK at init time, we + # can make this a value. + indices_type_getter=lambda: self.quant_method.topk_indices_dtype, + zero_expert_type=zero_expert_type, + num_logical_experts=self.logical_num_experts, + ) self.routing_method_type: RoutingMethodType = self.router.routing_method_type # Round up hidden size before creating moe_config. 
diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py index 4a6f2a5500c4..59781f75ccff 100644 --- a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py @@ -20,7 +20,6 @@ class ZeroExpertFusedMoE(FusedMoE): def __init__( self, - zero_expert_num: int, zero_expert_type: str, e_score_correction_bias: torch.Tensor | None = None, **kwargs, @@ -55,9 +54,3 @@ def __init__( # for compatibility with monolithic kernels that read it directly. if e_score_correction_bias is not None: self.e_score_correction_bias = e_score_correction_bias[:num_real_experts] - - # Expose zero_expert_num=0 and zero_expert_type=None for - # compatibility with quantization methods that check these attributes. - # The actual zero expert handling is done by ZeroExpertRouter. - self.zero_expert_num = 0 - self.zero_expert_type = None diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index b695e062d276..ddd46a2c4382 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -292,10 +292,8 @@ def __init__( prefix=f"{prefix}.gate", ) - assert config.zero_expert_num is not None assert config.zero_expert_type is not None self.experts = ZeroExpertFusedMoE( - zero_expert_num=config.zero_expert_num, zero_expert_type=config.zero_expert_type, e_score_correction_bias=self.router.e_score_correction_bias, num_experts=num_experts, From dabcd5f0d2a950056068c5c951f4d6f921949b86 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 27 Feb 2026 16:21:48 -0500 Subject: [PATCH 044/191] better test Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 49 +++++++++++++++++------ 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index 
811bd35a869c..066506ec8420 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -13,6 +13,7 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.forward_context import get_forward_context, set_forward_context +from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( ZeroExpertRouter, ) @@ -118,15 +119,16 @@ def test_zero_expert_moe_forward(zero_expert_moe, num_tokens): @pytest.mark.parametrize("num_tokens", [1, 32]) def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): - """Validate that the layer output equals real expert output plus zero - expert contribution. + """Validate that the ZeroExpertFusedMoE output equals a plain FusedMoE + output (real experts only) plus the zero expert contribution. The key invariant is: - layer.forward(h, r) == quant_method.apply(routing) + zero_expert_output + zero_layer.forward(h, r_full) == plain_layer.forward(h, r_real) + + zero_expert_output - We compute routing and zero expert output via router.select_experts(), - then compute real expert output via quant_method.apply() directly, and - verify that the layer forward produces their sum. + We create a plain FusedMoE layer with the same weights and real-expert-only + router logits, compute the zero expert output via the ZeroExpertRouter, and + verify the sum matches the ZeroExpertFusedMoE output. """ layer, vllm_config = zero_expert_moe num_experts = 4 @@ -148,23 +150,44 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): get_forward_context().all_moe_layers = None - # Compute routing and zero expert output directly (without the - # runner consuming zero_expert_output). + # Create a plain FusedMoE layer with the same config but no zero + # experts. Use a separate prefix to avoid collision. 
+ plain_layer = FusedMoE( + num_experts=num_experts, + top_k=layer.top_k, + hidden_size=layer.hidden_size, + intermediate_size=layer.intermediate_size_per_partition, + params_dtype=torch.bfloat16, + prefix="test_zero_expert_moe_plain", + renormalize=False, + scoring_func="softmax", + e_score_correction_bias=layer.e_score_correction_bias, + ).cuda() + + # Share weights from the zero expert layer. + plain_layer.w13_weight.data.copy_(layer.w13_weight.data) + plain_layer.w2_weight.data.copy_(layer.w2_weight.data) + plain_layer.quant_method.process_weights_after_loading(plain_layer) + + # Compute routing via the ZeroExpertRouter. This produces masked + # topk_weights/topk_ids (zero expert entries have weight=0, id=0) + # and stores zero_expert_output as a side effect. topk_weights, topk_ids = layer.router.select_experts( hidden_states, router_logits ) zero_output = layer.router.zero_expert_output - # Compute real expert output via quant_method.apply(). - real_output = layer.quant_method.apply( - layer=layer, + # Compute real expert output using the plain layer with the masked + # routing from the ZeroExpertRouter. + real_output = plain_layer.quant_method.apply( + layer=plain_layer, x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, shared_experts_input=None, ) - # Get the combined output from layer.forward(). + # Get the combined output from the zero expert layer. 
full_output = layer.forward(hidden_states, router_logits) assert zero_output is not None, "Zero expert output should not be None" @@ -178,7 +201,7 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): expected, atol=0, rtol=0, - msg="Layer output should equal real expert output " + msg="ZeroExpertFusedMoE output should equal plain FusedMoE output " "plus zero expert contribution", ) From 3b950b01972710b79840749bdcb9bb0380fd548e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 27 Feb 2026 16:33:41 -0500 Subject: [PATCH 045/191] remove ZeroExpertFusedMoE Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 21 +++---- .../layers/fused_moe/__init__.py | 4 -- vllm/model_executor/layers/fused_moe/layer.py | 8 +++ .../layers/fused_moe/zero_expert_fused_moe.py | 56 ------------------- vllm/model_executor/models/longcat_flash.py | 6 +- 5 files changed, 20 insertions(+), 75 deletions(-) delete mode 100644 vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index 066506ec8420..92507eabf22f 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for ZeroExpertFusedMoE. +"""Tests for FusedMoE with zero experts. Verifies that: - The ZeroExpertRouter is properly created and used as the layer router. -- A forward pass through ZeroExpertFusedMoE produces correct output. +- A forward pass through FusedMoE with zero experts produces correct output. - The output decomposes correctly into real expert + zero expert contributions. 
""" @@ -17,15 +17,12 @@ from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( ZeroExpertRouter, ) -from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import ( - ZeroExpertFusedMoE, -) from vllm.v1.worker.workspace import init_workspace_manager @pytest.fixture def zero_expert_moe(dist_init, default_vllm_config): - """Create a ZeroExpertFusedMoE layer with zero experts.""" + """Create a FusedMoE layer with zero experts.""" num_experts = 4 top_k = 2 # hidden_size must be >= 256 for the zero expert identity kernel to @@ -46,7 +43,7 @@ def zero_expert_moe(dist_init, default_vllm_config): with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): init_workspace_manager(torch.cuda.current_device()) - layer = ZeroExpertFusedMoE( + layer = FusedMoE( zero_expert_type="identity", e_score_correction_bias=e_score_correction_bias, num_experts=num_experts, @@ -67,7 +64,7 @@ def zero_expert_moe(dist_init, default_vllm_config): @pytest.mark.parametrize("num_tokens", [1, 32]) def test_zero_expert_moe_router_is_zero_expert_router(zero_expert_moe, num_tokens): - """Verify that ZeroExpertFusedMoE creates a ZeroExpertRouter.""" + """Verify that FusedMoE with zero_expert_type creates a ZeroExpertRouter.""" layer, _ = zero_expert_moe assert isinstance(layer.router, ZeroExpertRouter), ( f"Expected ZeroExpertRouter but got {type(layer.router).__name__}." 
@@ -84,7 +81,7 @@ def test_zero_expert_moe_no_custom_routing_fn(zero_expert_moe, num_tokens): @pytest.mark.parametrize("num_tokens", [1, 32]) def test_zero_expert_moe_forward(zero_expert_moe, num_tokens): - """Run a forward pass through ZeroExpertFusedMoE and verify output shape.""" + """Run a forward pass through FusedMoE with zero experts and verify output shape.""" layer, vllm_config = zero_expert_moe hidden_size = layer.hidden_size @@ -119,7 +116,7 @@ def test_zero_expert_moe_forward(zero_expert_moe, num_tokens): @pytest.mark.parametrize("num_tokens", [1, 32]) def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): - """Validate that the ZeroExpertFusedMoE output equals a plain FusedMoE + """Validate that the FusedMoE output equals a plain FusedMoE output (real experts only) plus the zero expert contribution. The key invariant is: @@ -128,7 +125,7 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): We create a plain FusedMoE layer with the same weights and real-expert-only router logits, compute the zero expert output via the ZeroExpertRouter, and - verify the sum matches the ZeroExpertFusedMoE output. + verify the sum matches the FusedMoE output. 
""" layer, vllm_config = zero_expert_moe num_experts = 4 @@ -201,7 +198,7 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): expected, atol=0, rtol=0, - msg="ZeroExpertFusedMoE output should equal plain FusedMoE output " + msg="FusedMoE output should equal plain FusedMoE output " "plus zero expert contribution", ) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index f56a2e63bf40..a28fd06930c6 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -33,9 +33,6 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) -from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import ( - ZeroExpertFusedMoE, -) from vllm.triton_utils import HAS_TRITON _config: dict[str, Any] | None = None @@ -68,7 +65,6 @@ def get_config() -> dict[str, Any] | None: "GateLinear", "RoutingMethodType", "SharedFusedMoE", - "ZeroExpertFusedMoE", "activation_without_mul", "apply_moe_activation", "override_config", diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d3b888bee126..43e4b124764e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -511,6 +511,14 @@ def __init__( ) self.routing_method_type: RoutingMethodType = self.router.routing_method_type + # When using zero experts, slice e_score_correction_bias to cover + # only real experts, for compatibility with monolithic kernels that + # read it directly. + if zero_expert_type is not None and e_score_correction_bias is not None: + self.e_score_correction_bias = e_score_correction_bias[ + : self.logical_num_experts + ] + # Round up hidden size before creating moe_config. # This way moe_config is created with the correct hidden_size from the start. 
unpadded_hidden_size = hidden_size diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py deleted file mode 100644 index 59781f75ccff..000000000000 --- a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.model_executor.layers.fused_moe.layer import FusedMoE - - -class ZeroExpertFusedMoE(FusedMoE): - """ - A FusedMoE operation that also computes the results of zero experts. - Zero experts perform identity operations (scaled pass-through) instead - of full MLP computations. - - Uses a ZeroExpertRouter as the layer's main router. The router handles - routing over all experts (real + zero) with the full e_score_correction_bias, - computes zero expert contributions as a side effect, and remaps zero expert - IDs so downstream MoE computation only processes real experts. - """ - - def __init__( - self, - zero_expert_type: str, - e_score_correction_bias: torch.Tensor | None = None, - **kwargs, - ): - assert ( - "custom_routing_function" not in kwargs - or kwargs.get("custom_routing_function") is None - ), ( - "ZeroExpertFusedMoE does not support external custom_routing_function. " - "Routing is handled by ZeroExpertRouter." - ) - - assert "router" not in kwargs or kwargs.get("router") is None, ( - "ZeroExpertFusedMoE creates its own ZeroExpertRouter. Do not pass a router." - ) - - # Remove custom_routing_function from kwargs if present - kwargs.pop("custom_routing_function", None) - kwargs.pop("router", None) - - # Pass the full e_score_correction_bias (real + zero experts) and - # zero_expert_type through to the router factory, which will create - # a ZeroExpertRouter. 
- num_real_experts = kwargs["num_experts"] - if e_score_correction_bias is not None: - kwargs["e_score_correction_bias"] = e_score_correction_bias - kwargs["zero_expert_type"] = zero_expert_type - - super().__init__(**kwargs) - - # Fix self.e_score_correction_bias to only cover real experts, - # for compatibility with monolithic kernels that read it directly. - if e_score_correction_bias is not None: - self.e_score_correction_bias = e_score_correction_bias[:num_real_experts] diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index ddd46a2c4382..375b0b69b1f9 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -46,7 +46,7 @@ from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import FusedMoE, ZeroExpertFusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -293,7 +293,7 @@ def __init__( ) assert config.zero_expert_type is not None - self.experts = ZeroExpertFusedMoE( + self.experts = FusedMoE( zero_expert_type=config.zero_expert_type, e_score_correction_bias=self.router.e_score_correction_bias, num_experts=num_experts, @@ -330,7 +330,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states_padded.to(self.router_params_dtype) ) - # ZeroExpertFusedMoE handles routing memoization and zero expert computation + # FusedMoE handles routing memoization and zero expert computation # internally. Pass full router_logits (including zero experts) so that # zero experts can be properly identified in routing. 
final_hidden_states = self.experts( From 8f889913e60d30d766933e57b4a071ff2de3ed8d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 2 Mar 2026 08:26:28 -0500 Subject: [PATCH 046/191] Add comment Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index 92507eabf22f..24edca00b6af 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -6,6 +6,8 @@ - The ZeroExpertRouter is properly created and used as the layer router. - A forward pass through FusedMoE with zero experts produces correct output. - The output decomposes correctly into real expert + zero expert contributions. + +Note: tests generated with Claude. """ import pytest From 3a9852b13ed7832ea652da3c380b9780b8e46944 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 19:06:20 +0000 Subject: [PATCH 047/191] fix lint Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index 24edca00b6af..d8f900256ec3 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -43,7 +43,7 @@ def zero_expert_moe(dist_init, default_vllm_config): vllm_config.compilation_config.static_forward_context = dict() with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config): - init_workspace_manager(torch.cuda.current_device()) + init_workspace_manager(torch.accelerator.current_device_index()) layer = FusedMoE( zero_expert_type="identity", From b94eacac0caffc0dd3c0a83cb162c147d4e04a3e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 3 Mar 2026 14:12:29 -0500 Subject: [PATCH 048/191] move shared expert all gather to SharedExperts Signed-off-by: Bill Nell --- 
.../fused_moe/runner/moe_runner_base.py | 5 ++-- .../layers/fused_moe/runner/shared_experts.py | 25 ++++++------------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 8cd24c2e2316..d673bad626d2 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -300,8 +300,9 @@ def reduce_and_trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: func = trunc if isinstance(states, tuple): - return tuple( - [func(s, trunc_size) for s, trunc_size in zip(states, trunc_sizes)] + return ( + trunc(states[0], trunc_sizes[0]), # shared: already reduced + func(states[1], trunc_sizes[1]), # fused: reduce here ) else: assert len(trunc_sizes) == 1 diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index c4333bcf9f19..fb3d45678013 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -161,23 +161,22 @@ def _run_in_aux_stream( return output - def _maybe_reduce_shared_out(self, shared_out: torch.Tensor) -> torch.Tensor: - # Reduce shared expert outputs if necessary, since the MLP - # should have been created with reduce_results=False. 
+ def _maybe_reduce_shared_output(self, output: torch.Tensor) -> torch.Tensor: if ( - self._reduce_results - and self._quant_method.moe_kernel is not None + self._quant_method.moe_kernel is not None and self._quant_method.moe_kernel.output_is_reduced() and get_tensor_model_parallel_world_size() > 1 ): - shared_out = tensor_model_parallel_all_reduce(shared_out) - return shared_out + output = tensor_model_parallel_all_reduce(output) + return output @property def output(self) -> torch.Tensor: assert self._output is not None output = self._output self._output = None + if output is not None: + output = self._maybe_reduce_shared_output(output) return output def apply( @@ -195,14 +194,4 @@ def apply( if order == SharedExpertsOrder.MULTI_STREAM_OVERLAPPED: self._output = self._run_in_aux_stream(shared_experts_input) else: - self._output = self._layer(shared_experts_input) - - if order == SharedExpertsOrder.EXTERNAL: - # TODO: figure out how to combine this with maybe_reduce_output? - # or get rid of it completely. - assert self._output is not None - self._output = self._maybe_reduce_shared_out(self._output) - - assert self._output is not None - - # TODO(bnell): potentially do AFTER reduce here instead of in runner. 
+ self._output = self._shared_experts(shared_experts_input) From 03de451b5f863f3f214cd21a703a643f18da45e0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 3 Mar 2026 14:22:50 -0500 Subject: [PATCH 049/191] remove must_reduce_shared_expert_outputs external method Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 15 ------- .../layers/fused_moe/runner/moe_runner.py | 4 -- .../fused_moe/runner/moe_runner_base.py | 4 +- vllm/model_executor/models/exaone_moe.py | 40 +++++++++---------- 4 files changed, 22 insertions(+), 41 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 43e4b124764e..66a7eb6b85d3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1505,21 +1505,6 @@ def moe_quant_config(self) -> FusedMoEQuantConfig | None: self.ensure_moe_quant_config_init() return self.quant_method.moe_quant_config - def must_reduce_shared_expert_outputs(self) -> bool: - """ - The shared_experts are typically computed using the RowParallelLinear - layer. The result of this function is typically used as - the reduce_results argument to the module. - When just tensor-parallel is used, it is not required to reduce - the shared_experts results immediately. Instead we reduce at the - once at the end of the MoE op. (Refer to DeepSeekV2MoE module) - With EP and all2all kernels - this is no longer viable as all - GPU ranks in DP, produce the complete set of hidden_states. - Therefore it is required that we reduce the shared_experts output - early. - """ - return self.runner.must_reduce_shared_expert_outputs() - def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor): """ Some combine kernels reduce across GPU ranks by default. 
diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 720e997cda36..cb0539186a67 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -22,10 +22,6 @@ def forward( ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError - @abstractmethod - def must_reduce_shared_expert_outputs(self) -> bool: - raise NotImplementedError - @abstractmethod def maybe_all_reduce_tensor_model_parallel( self, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index d673bad626d2..b7f01ee98037 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -223,7 +223,7 @@ def is_internal_router(self) -> bool: def reduce_results(self) -> bool: raise NotImplementedError - def must_reduce_shared_expert_outputs(self) -> bool: + def _must_reduce_shared_expert_outputs(self) -> bool: """ The shared_experts are typically computed using the RowParallelLinear layer. The result of this function is typically used as @@ -245,7 +245,7 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens """ Some combine kernels reduce across GPU ranks by default. 
""" - if self.must_reduce_shared_expert_outputs(): + if self._must_reduce_shared_expert_outputs(): return final_hidden_states else: return tensor_model_parallel_all_reduce(final_hidden_states) diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py index d7282edcf4f6..66d05e095ac5 100644 --- a/vllm/model_executor/models/exaone_moe.py +++ b/vllm/model_executor/models/exaone_moe.py @@ -31,6 +31,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -116,7 +117,22 @@ def __init__( self.physical_expert_start + self.n_local_physical_experts ) - self.experts = FusedMoE( + if getattr(config, "num_shared_experts", 0) > 0: + intermediate_size = config.moe_intermediate_size * config.num_shared_experts + self.shared_experts = ExaoneMoeGatedMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + prefix=f"{prefix}.shared_experts", + ) + else: + self.shared_experts = None + + self.experts = SharedFusedMoE( + shared_experts=self.shared_experts, + gate=self.gate, num_experts=self.n_routed_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, @@ -135,34 +151,18 @@ def __init__( num_redundant_experts=self.n_redundant_experts, ) - if getattr(config, "num_shared_experts", 0) > 0: - intermediate_size = config.moe_intermediate_size * config.num_shared_experts - self.shared_experts = ExaoneMoeGatedMLP( - hidden_size=config.hidden_size, - intermediate_size=intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - reduce_results=self.experts.must_reduce_shared_expert_outputs(), - 
prefix=f"{prefix}.shared_experts", - ) - else: - self.shared_experts = None - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. orig_shape = hidden_states.shape hidden_dim = hidden_states.shape[-1] hidden_states = hidden_states.view(-1, hidden_dim) - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) - - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=router_logits + shared_output, final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=hidden_states ) if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + assert shared_output is not None final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: From d43fa50e298c1bc13dc5623e496e9f6098361494 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 3 Mar 2026 18:49:56 -0500 Subject: [PATCH 050/191] wip moving experts epilog into MoERunnerBase Signed-off-by: Bill Nell --- .../test_shared_fused_moe_routed_transform.py | 29 ++---- vllm/forward_context.py | 2 +- vllm/lora/layers/fused_moe.py | 3 - vllm/model_executor/layers/fused_moe/layer.py | 22 ++--- .../fused_moe/runner/chunking_moe_runner.py | 7 +- .../fused_moe/runner/default_moe_runner.py | 10 +-- .../layers/fused_moe/runner/moe_runner.py | 9 +- .../fused_moe/runner/moe_runner_base.py | 89 +++++++++---------- .../fused_moe/runner/moe_runner_factory.py | 6 +- .../layers/fused_moe/runner/shared_experts.py | 6 -- vllm/model_executor/models/afmoe.py | 13 +-- vllm/model_executor/models/aria.py | 7 +- vllm/model_executor/models/bailing_moe.py | 17 +--- vllm/model_executor/models/deepseek_v2.py | 34 +------ vllm/model_executor/models/dots1.py | 13 +-- vllm/model_executor/models/ernie45_moe.py | 11 --- vllm/model_executor/models/ernie45_vl_moe.py | 35 ++------ vllm/model_executor/models/exaone_moe.py | 12 +-- vllm/model_executor/models/glm4_moe.py | 20 +---- 
vllm/model_executor/models/hunyuan_v1.py | 7 -- vllm/model_executor/models/kimi_linear.py | 46 +++++----- vllm/model_executor/models/lfm2_moe.py | 12 +-- vllm/model_executor/models/llama4.py | 8 +- vllm/model_executor/models/minimax_m2.py | 5 -- vllm/model_executor/models/nemotron_h.py | 30 ++----- vllm/model_executor/models/openpangu.py | 24 +---- vllm/model_executor/models/qwen2_moe.py | 7 -- vllm/model_executor/models/qwen3_moe.py | 10 +-- vllm/model_executor/models/qwen3_next.py | 8 -- vllm/model_executor/models/step3_text.py | 4 - vllm/model_executor/models/step3p5.py | 21 +---- .../model_executor/models/transformers/moe.py | 41 +-------- 32 files changed, 126 insertions(+), 442 deletions(-) diff --git a/tests/kernels/moe/test_shared_fused_moe_routed_transform.py b/tests/kernels/moe/test_shared_fused_moe_routed_transform.py index b6ef19ddaf3c..ccba92b81782 100644 --- a/tests/kernels/moe/test_shared_fused_moe_routed_transform.py +++ b/tests/kernels/moe/test_shared_fused_moe_routed_transform.py @@ -98,7 +98,6 @@ def test_routed_input_transform_inside_vs_outside( top_k=top_k, hidden_size=latent_size, intermediate_size=intermediate_size, - reduce_results=False, renormalize=True, params_dtype=dtype, tp_size=1, @@ -116,7 +115,6 @@ def test_routed_input_transform_inside_vs_outside( top_k=top_k, hidden_size=latent_size, intermediate_size=intermediate_size, - reduce_results=False, renormalize=True, params_dtype=dtype, tp_size=1, @@ -140,28 +138,19 @@ def test_routed_input_transform_inside_vs_outside( router_logits = torch.randn(num_tokens, num_experts, device="cuda", dtype=dtype) with set_forward_context(None, vllm_config, num_tokens=num_tokens): - shared_out_A, routed_out_A = moe_with_transform( - hidden_states, router_logits - ) + # Method A: combined output (shared + routed) + combined_A = moe_with_transform(hidden_states, router_logits) + # Method B: manually transform, get routed output, add shared transformed_hidden = routed_transform(hidden_states) - 
shared_out_B, routed_out_B = moe_without_transform( - transformed_hidden, router_logits - ) + routed_out_B = moe_without_transform(transformed_hidden, router_logits) + shared_out_B = shared_experts(hidden_states) + combined_B = shared_out_B + routed_out_B torch.testing.assert_close( - routed_out_A, - routed_out_B, - atol=1e-3, - rtol=1e-3, - msg="Routed output should match: transform inside vs outside", - ) - - expected_shared_out = shared_experts(hidden_states) - - torch.testing.assert_close( - shared_out_A, - expected_shared_out, + combined_A, + combined_B, atol=1e-3, rtol=1e-3, + msg="Combined output should match: transform inside vs outside", ) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index a7aaeff4fc85..c1ead5e2034f 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -211,7 +211,7 @@ class ForwardContext: # For torch.compile cold start times, we need to avoid hard-coding # any strings into the graph. Right now, the vllm.moe_forward - # and vllm.moe_forward_shared custom operators hard-code strings into + # custom operator hard-codes strings into # the graph. 
# # The workaround is to store a list of the strings that each of those diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 11332f4cb228..fd9023b01fef 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -593,9 +593,6 @@ def set_lora( def forward(self, *args, **kwargs): return self.base_layer.forward(*args, **kwargs) - def maybe_all_reduce_tensor_model_parallel(self, *args, **kwargs): - return self.base_layer.maybe_all_reduce_tensor_model_parallel(*args, **kwargs) - @property def quant_method(self): return self.base_layer.quant_method diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 66a7eb6b85d3..9cb82780ac22 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -273,7 +273,6 @@ class FusedMoE(CustomOp): hidden_size: Input hidden state size of the transformer intermediate_size: Intermediate size of the experts params_dtype: Data type for the parameters. - reduce_results: Whether to all_reduce on the output of the layer renormalize: Whether to renormalize the logits in the fused_moe kernel quant_config: Quantization configure. enable_eplb: Whether to enable expert parallelism load balancer. 
@@ -289,7 +288,6 @@ def __init__( hidden_size: int, intermediate_size: int, params_dtype: torch.dtype | None = None, - reduce_results: bool = False, renormalize: bool = True, use_grouped_topk: bool = False, num_expert_group: int | None = None, @@ -317,11 +315,15 @@ def __init__( gate: torch.nn.Module | None = None, shared_experts: torch.nn.Module | None = None, routed_input_transform: torch.nn.Module | None = None, + routed_output_transform: torch.nn.Module | None = None, + output_scale: float | None = None, zero_expert_type: str | None = None, ): super().__init__() self._routed_input_transform = routed_input_transform + self._routed_output_transform = routed_output_transform + self._output_scale = output_scale if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -468,7 +470,6 @@ def __init__( assert intermediate_size % self.tp_size == 0 self.intermediate_size_per_partition = intermediate_size // self.tp_size - self.reduce_results = reduce_results self.renormalize = renormalize # TODO(bnell): these attributes are only used by monolithic kernels. @@ -667,7 +668,6 @@ def _init_shared_experts(self): # -> SharedExperts | None: # called, i.e. by a MK or by the MoERunner. # Once the MK can be created upfront, we can just pass in the proper # flags derived from the quant_method's MK. 
- reduce_results=self.reduce_results, quant_method=self.quant_method, ) @@ -684,8 +684,9 @@ def _init_runner(self) -> MoERunner: gate=self._gate, shared_experts=self.shared_experts, quant_method=self.quant_method, - reduce_results=self.reduce_results, enable_dbo=self.vllm_config.parallel_config.enable_dbo, + routed_output_transform=self._routed_output_transform, + output_scale=self._output_scale, ) # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py @@ -1505,17 +1506,11 @@ def moe_quant_config(self) -> FusedMoEQuantConfig | None: self.ensure_moe_quant_config_init() return self.quant_method.moe_quant_config - def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor): - """ - Some combine kernels reduce across GPU ranks by default. - """ - return self.runner.maybe_all_reduce_tensor_model_parallel(final_hidden_states) - def forward_native( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: return self.runner.forward( hidden_states, router_logits, @@ -1531,7 +1526,7 @@ def forward_cuda( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: return self.forward_native(hidden_states, router_logits) @classmethod @@ -1588,7 +1583,6 @@ def extra_repr(self) -> str: f"intermediate_size_per_partition={self.intermediate_size_per_partition}, " # noqa: E501 f"tp_size={self.tp_size},\n" f"ep_size={self.ep_size}, " - f"reduce_results={self.reduce_results}, " ) return s diff --git a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py index 6626fad557b3..8aee95840cb5 100644 --- a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py @@ -27,12 +27,11 @@ class 
ChunkingMoERunner(MoERunnerBase): All MoERunnerBase state (moe_config, router, quant_method, etc.) is transparently delegated to the inner runner via __getattr__. ChunkingMoERunner only owns chunking-specific state: the pre-allocated - workspace buffers and the reduce_results override. + workspace buffers. Key behaviors: - Pre-allocates workspace tensors for CUDA graph compatibility - Processes chunks via inner._forward_impl per chunk - - Never reduces results (reduce_results always returns False) """ def __init__(self, inner: MoERunnerBase): @@ -55,10 +54,6 @@ def __getattr__(self, name): # so ChunkingMoERunner's own attributes and methods take priority. return getattr(self._inner, name) - @property - def reduce_results(self) -> bool: - return False - def _init_dp_chunking(self) -> list[torch.Tensor]: states_shape: tuple[int, ...] logits_shape: tuple[int, ...] diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 7b583bda14cc..523a5362da51 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -61,8 +61,9 @@ def __init__( gate: torch.nn.Module | None, shared_experts: SharedExperts | None, quant_method: FusedMoEMethodBase, - reduce_results: bool, enable_dbo: bool, + routed_output_transform: torch.nn.Module | None = None, + output_scale: float | None = None, ): super().__init__( layer, @@ -72,14 +73,11 @@ def __init__( gate, shared_experts, quant_method, - reduce_results, enable_dbo, + routed_output_transform=routed_output_transform, + output_scale=output_scale, ) - @property - def reduce_results(self) -> bool: - return self._reduce_results - @property def do_naive_dispatch_combine(self) -> bool: return ( diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index cb0539186a67..856ae969214d 
100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -19,14 +19,7 @@ def forward( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - raise NotImplementedError - - @abstractmethod - def maybe_all_reduce_tensor_model_parallel( - self, - final_hidden_states: torch.Tensor, - ): + ) -> torch.Tensor: raise NotImplementedError @abstractmethod diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index b7f01ee98037..9eaddcdff9f5 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -54,7 +54,7 @@ def get_layer_from_name(layer_name: str) -> torch.nn.Module: raise AssertionError( "We expected the number of MOE layers in `all_moe_layers` " "to be equal to the number of " - "{vllm.moe_forward, vllm.moe_forward_shared} calls." + "vllm.moe_forward calls." ) layer_name = all_moe_layers[moe_layer_index] forward_context.moe_layer_index += 1 @@ -99,6 +99,10 @@ def _moe_forward_fake( shared_experts_input: torch.Tensor | None, layer_name: _layer_name_type, ) -> torch.Tensor: + # For latent MoE with reduce_results=True: output has full hidden_size + # (from shared_experts_input), not latent_size (from hidden_states). + if shared_experts_input is not None: + return torch.empty_like(shared_experts_input) return torch.empty_like(hidden_states) @@ -170,7 +174,6 @@ class MoERunnerBase(MoERunner): allowing flexibility in the actual MoE computation implementation. 
Key abstract methods that subclasses must implement: - - reduce_results: Determines whether results should be reduced across ranks - _forward_impl: The core MoE computation logic specific to each runner type """ @@ -183,8 +186,9 @@ def __init__( gate: torch.nn.Module | None, shared_experts: SharedExperts | None, quant_method: FusedMoEMethodBase, - reduce_results: bool, enable_dbo: bool, + routed_output_transform: torch.nn.Module | None = None, + output_scale: float | None = None, ): super().__init__() self.moe_config = moe_config @@ -193,9 +197,10 @@ def __init__( self.gate = gate self.shared_experts = shared_experts self.quant_method = quant_method - self._reduce_results = reduce_results self.enable_dbo = enable_dbo self.enable_eplb = moe_config.moe_parallel_config.enable_eplb + self.routed_output_transform = routed_output_transform + self.output_scale = output_scale # Needed for string -> FusedMoE layer lookup in custom ops. self.layer_name = layer.layer_name @@ -218,11 +223,6 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: def is_internal_router(self) -> bool: return self.gate is not None - @property - @abstractmethod - def reduce_results(self) -> bool: - raise NotImplementedError - def _must_reduce_shared_expert_outputs(self) -> bool: """ The shared_experts are typically computed using the RowParallelLinear @@ -260,7 +260,7 @@ def apply_routed_input_transform( routed experts get the transformed [S, moe_latent_size]. TODO: For latent MoE bandwidth optimization, fc2_latent_proj could be - moved inside SharedFusedMoE to all-reduce on the smaller latent + moved inside FusedMoE to all-reduce on the smaller latent dimension. 
Returns (possibly transformed) hidden states and the input for shared @@ -279,34 +279,23 @@ def apply_routed_input_transform( hidden_states if self.shared_experts is not None else None, ) - def _maybe_reduce_output( + def _combine_and_reduce( self, - states: torch.Tensor | tuple[torch.Tensor, torch.Tensor], + states: torch.Tensor, trunc_sizes: list[int], - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - def trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: - return x[..., :trunc_size] + ) -> torch.Tensor: + assert len(trunc_sizes) == 1 + result = states[..., : trunc_sizes[0]] - def reduce_and_trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: - return trunc(self.maybe_all_reduce_tensor_model_parallel(x), trunc_size) + if self.output_scale is not None: + result = result * self.output_scale - if ( - not self.moe_config.is_sequence_parallel - and self.reduce_results - and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) + if not self.moe_config.is_sequence_parallel and ( + self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1 ): - func = reduce_and_trunc - else: - func = trunc + result = self.maybe_all_reduce_tensor_model_parallel(result) - if isinstance(states, tuple): - return ( - trunc(states[0], trunc_sizes[0]), # shared: already reduced - func(states[1], trunc_sizes[1]), # fused: reduce here - ) - else: - assert len(trunc_sizes) == 1 - return func(states, trunc_sizes[0]) + return result def _encode_layer_name(self) -> str | ModuleName: if HAS_OPAQUE_TYPE: @@ -339,8 +328,8 @@ def _maybe_pad_hidden_states( value=0.0, ) - if self.shared_experts is not None: - orig_hidden_dims = [shared_experts_hidden_dim, transformed_hidden_dim] + if self.routed_output_transform is not None and shared_experts_hidden_dim > 0: + orig_hidden_dims = [shared_experts_hidden_dim] else: orig_hidden_dims = [transformed_hidden_dim] @@ -426,22 +415,19 @@ def _maybe_overlap_gate_with_shared_experts( def _maybe_add_zero_expert_output( self, - result: 
torch.Tensor | tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + result: torch.Tensor, + ) -> torch.Tensor: if isinstance(self.router, ZeroExpertRouter): zero_expert_output = self.router.zero_expert_output assert zero_expert_output is not None - if isinstance(result, tuple): - result = (result[0], result[1] + zero_expert_output) - else: - result = result + zero_expert_output + result = result + zero_expert_output return result def forward( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: """Invoke the fused moe layer. Input: @@ -450,8 +436,6 @@ def forward( Output: - The new hidden_states. - or - - A tuple of (shared experts output, new hidden_states). Calling sequence - forward @@ -488,7 +472,7 @@ def forward( self._encode_layer_name(), ) - result = self._maybe_reduce_output(fused_output, og_hidden_dims) + result = self._combine_and_reduce(fused_output, og_hidden_dims) return self._maybe_add_zero_expert_output(result) @@ -498,7 +482,7 @@ def forward_dispatch( hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() @@ -514,13 +498,26 @@ def forward_dispatch( ) with self._sequence_parallel_context(): - return self._forward_impl( + result = self._forward_impl( layer, hidden_states, router_logits, shared_experts_input, ) + if isinstance(result, tuple): + shared_output, fused_output = result + # Apply output transform (e.g. 
latent → full dim) + if self.routed_output_transform is not None: + r = self.routed_output_transform(fused_output) + fused_output = r[0] if isinstance(r, tuple) else r + # If combine kernel already reduced fused, reduce shared to match + if self._must_reduce_shared_expert_outputs(): + shared_output = tensor_model_parallel_all_reduce(shared_output) + result = shared_output + fused_output + + return result + @abstractmethod def _forward_impl( self, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py index 4bda1ca89304..e6100814cad8 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py @@ -32,8 +32,9 @@ def create_moe_runner( gate: torch.nn.Module | None, shared_experts: SharedExperts | None, quant_method: FusedMoEMethodBase, - reduce_results: bool, enable_dbo: bool, + routed_output_transform: torch.nn.Module | None = None, + output_scale: float | None = None, ) -> MoERunner: runner = DefaultMoERunner( layer, @@ -43,8 +44,9 @@ def create_moe_runner( gate, shared_experts, quant_method, - reduce_results, enable_dbo, + routed_output_transform=routed_output_transform, + output_scale=output_scale, ) if moe_config.moe_parallel_config.use_dp_chunking: return ChunkingMoERunner(runner) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index fb3d45678013..10c1e1cf5060 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -5,10 +5,6 @@ import torch import vllm.envs as envs -from vllm.distributed import ( - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, -) from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -50,7 +46,6 @@ def 
__init__( layer: torch.nn.Module, moe_config: FusedMoEConfig, quant_method: QuantizeMethodBase, - reduce_results: bool, ): from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, @@ -64,7 +59,6 @@ def __init__( self._layer = layer self._moe_config = moe_config self._quant_method = quant_method - self._reduce_results = reduce_results self._use_dp_chunking = moe_config.moe_parallel_config.use_dp_chunking # Allow disabling of the separate shared experts stream for diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 22037336411a..5bad52a0c496 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -131,7 +131,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=self.route_norm if self.score_func == "sigmoid" else False, quant_config=quant_config, use_grouped_topk=True, @@ -152,20 +151,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits = self.gate(hidden_states.to(dtype=torch.float32)) - fused_moe_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - if self.shared_experts is not None: - shared_output, final_hidden_states = fused_moe_out - final_hidden_states = final_hidden_states + shared_output - else: - final_hidden_states = fused_moe_out - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) - return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 908581786450..221e991dfc29 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -301,12 +301,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_output = 
torch.nn.functional.linear(hidden_states, self.router_weight) - sparse_expert_output = self.experts(hidden_states, router_output) - - if self.shared_experts is not None: - return sparse_expert_output[0] + sparse_expert_output[1] - else: - return sparse_expert_output + return self.experts(hidden_states, router_output) class AriaTextDecoderLayer(LlamaDecoderLayer): diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 7725dfa2a887..510d605f8046 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -291,7 +291,6 @@ def __init__( top_k=self.top_k, hidden_size=self.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=self.norm_expert_prob, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -301,6 +300,7 @@ def __init__( topk_group=self.topk_group, use_grouped_topk=self.use_grouped_topk, router_logits_dtype=self.router_dtype, + routed_scaling_factor=self.routed_scaling_factor, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -314,21 +314,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - - if self.shared_experts is not None: - shared_output, final_hidden_states = final_hidden_states - else: - shared_output = None - - final_hidden_states *= self.routed_scaling_factor - - if shared_output is not None: - final_hidden_states = final_hidden_states + shared_output - - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) return final_hidden_states.view(num_tokens, hidden_size) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index f31e9ac3e840..1a64299789fb 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -312,7 +312,6 
@@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, @@ -320,11 +319,7 @@ def __init__( topk_group=getattr(config, "topk_group", 1), prefix=f"{prefix}.experts", scoring_func=getattr(config, "scoring_func", "softmax"), - # we do scaling outside, set factor to 1.0 to avoid double mul - # aiter applies routed_scaling_factor internally - routed_scaling_factor=1.0 - if not self.is_rocm_aiter_moe_enabled - else self.routed_scaling_factor, + routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts, @@ -357,43 +352,20 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = sequence_parallel_chunk(hidden_states) if self.experts.is_internal_router: - # In this case, the gate/router runs inside the FusedMoE class - fused_moe_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=hidden_states ) else: - # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - fused_moe_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - shared_output, final_hidden_states = fused_moe_out - if self.shared_experts is None: - assert shared_output is None - - # Fix FP16 overflow - # See DeepseekV2DecoderLayer for more details. 
- if hidden_states.dtype != torch.float16: - if not self.is_rocm_aiter_moe_enabled: - final_hidden_states *= self.routed_scaling_factor - elif self.shared_experts is not None: - assert shared_output is not None - shared_output *= 1.0 / self.routed_scaling_factor - - if self.shared_experts is not None: - assert shared_output is not None - final_hidden_states += shared_output - if self.is_sequence_parallel: final_hidden_states = tensor_model_parallel_all_gather( final_hidden_states, 0 ) final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 4e393145462a..fc2636db48ab 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -37,7 +37,6 @@ from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -120,7 +119,6 @@ def __init__( prefix: str = "", ): super().__init__() - self.tp_size = get_tensor_model_parallel_world_size() self.routed_scaling_factor = config.routed_scaling_factor self.n_shared_experts = config.n_shared_experts @@ -163,7 +161,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, @@ -174,6 +171,7 @@ def __init__( # we do scaling outside, set factor to 1.0 to avoid double mul routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias, + output_scale=self.routed_scaling_factor, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -182,16 
+180,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits, _ = self.gate(hidden_states) - shared_out, routed_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - if self.shared_experts is not None: - final_hidden_states = (routed_out + shared_out) * self.routed_scaling_factor - else: - final_hidden_states = routed_out * self.routed_scaling_factor - - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index f038cfb21f28..c92e230bcd21 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -194,7 +194,6 @@ def __init__( top_k=config.moe_k, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=True, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -215,16 +214,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states=hidden_states, router_logits=router_logits ) - if self.has_shared_experts: - final_hidden_states = final_hidden_states[0] + final_hidden_states[1] - else: - final_hidden_states = final_hidden_states[1] - - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) - return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 418fdcfa072b..e4b7ac6fb006 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -263,7 +263,6 @@ def __init__( top_k=config.moe_k, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size[0], - reduce_results=False, renormalize=True, quant_config=quant_config, 
e_score_correction_bias=self.e_score_correction_bias[0], @@ -301,7 +300,6 @@ def __init__( top_k=config.moe_k, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size[1], - reduce_results=False, renormalize=True, quant_config=quant_config, e_score_correction_bias=self.e_score_correction_bias[1], @@ -342,9 +340,6 @@ def forward( visual_token_mask = visual_token_mask.repeat(1, self.hidden_size).bool() text_token_mask = ~visual_token_mask final_experts_hidden_states = torch.zeros_like(hidden_states) - final_shared_output = ( - torch.zeros_like(hidden_states) if self.has_shared_experts else None - ) text_hidden_states = hidden_states[text_token_mask].reshape( -1, self.hidden_size @@ -356,26 +351,20 @@ def forward( text_router_logits, _ = self.text_experts_gate( text_hidden_states.to(dtype=torch.float32) ) - text_shared_output, text_experts_output = self.text_experts( + text_output = self.text_experts( hidden_states=text_hidden_states, router_logits=text_router_logits ) - final_experts_hidden_states[text_token_mask] = text_experts_output.flatten() - if self.has_shared_experts: - final_shared_output[text_token_mask] = text_shared_output.flatten() + final_experts_hidden_states[text_token_mask] = text_output.flatten() vision_router_logits, _ = self.vision_experts_gate( vision_hidden_states.to(dtype=torch.float32) ) - vision_shared_output, vision_experts_output = self.vision_experts( + vision_output = self.vision_experts( hidden_states=vision_hidden_states, router_logits=vision_router_logits ) - final_experts_hidden_states[visual_token_mask] = ( - vision_experts_output.flatten() - ) - if self.has_shared_experts: - final_shared_output[visual_token_mask] = vision_shared_output.flatten() + final_experts_hidden_states[visual_token_mask] = vision_output.flatten() - final_hidden_states = (final_shared_output, final_experts_hidden_states) + final_hidden_states = final_experts_hidden_states else: # only text modal input text_router_logits, _ = 
self.text_experts_gate( @@ -386,20 +375,6 @@ def forward( hidden_states=hidden_states, router_logits=text_router_logits ) - if self.has_shared_experts: - # for shared_experts model - final_hidden_states = final_hidden_states[0] + final_hidden_states[1] - else: - # for not shared_experts model - final_hidden_states = final_hidden_states[1] - - if self.tp_size > 1: - final_hidden_states = ( - self.text_experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) - ) - return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py index 66d05e095ac5..a46cadf007ee 100644 --- a/vllm/model_executor/models/exaone_moe.py +++ b/vllm/model_executor/models/exaone_moe.py @@ -137,7 +137,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, @@ -157,19 +156,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_dim = hidden_states.shape[-1] hidden_states = hidden_states.view(-1, hidden_dim) - shared_output, final_hidden_states = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=hidden_states ) - if self.shared_experts is not None: - assert shared_output is not None - final_hidden_states = final_hidden_states + shared_output - - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 - final_hidden_states - ) - return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index d0e6cb6ada8b..cef934222382 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -184,7 +184,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, 
intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, @@ -192,8 +191,7 @@ def __init__( topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func="sigmoid", - # we do scaling outside, set factor to 1.0 to avoid double mul - routed_scaling_factor=1.0, + routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts, @@ -207,23 +205,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # router_logits: (num_tokens, n_experts) router_logits = self.gate(hidden_states.to(dtype=torch.float32)) - fused_moe_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - - if self.shared_experts is not None: - shared_output, final_hidden_states = fused_moe_out - assert shared_output is not None - final_hidden_states = ( - final_hidden_states * self.routed_scaling_factor + shared_output - ) - else: - final_hidden_states = fused_moe_out * self.routed_scaling_factor - - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index a0130402c66f..35d30006a66a 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -39,7 +39,6 @@ get_ep_group, get_pp_group, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention @@ -445,7 +444,6 @@ def __init__( top_k=top_k, hidden_size=config.hidden_size, intermediate_size=intermediate_size, - reduce_results=False, renormalize=top_k > 1, 
quant_config=quant_config, prefix=f"{prefix}.experts", @@ -464,11 +462,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - if self.shared_mlp is not None: - final_hidden_states = final_hidden_states[0] + final_hidden_states[1] - - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index 4cd7b63c1472..e586a3ac3469 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -11,11 +11,10 @@ from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.kda import KimiDeltaAttention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -132,12 +131,25 @@ def __init__( self.gate.e_score_correction_bias = nn.Parameter(torch.empty(num_experts)) - self.experts = FusedMoE( + if self.num_shared_experts is not None: + intermediate_size = moe_intermediate_size * self.num_shared_experts + self.shared_experts = KimiMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + prefix=f"{prefix}.shared_experts", + ) + else: + self.shared_experts = None + + self.experts = SharedFusedMoE( + shared_experts=self.shared_experts, num_experts=num_experts, top_k=config.num_experts_per_token, hidden_size=hidden_size, intermediate_size=moe_intermediate_size, - reduce_results=False, 
renormalize=moe_renormalize, quant_config=quant_config, use_grouped_topk=config.use_grouped_topk, @@ -146,34 +158,16 @@ def __init__( prefix=f"{prefix}.experts", scoring_func=config.moe_router_activation_func, e_score_correction_bias=self.gate.e_score_correction_bias, + routed_scaling_factor=self.routed_scaling_factor, ) - if self.num_shared_experts is not None: - intermediate_size = moe_intermediate_size * self.num_shared_experts - self.shared_experts = KimiMLP( - hidden_size=config.hidden_size, - intermediate_size=intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - reduce_results=False, - prefix=f"{prefix}.shared_experts", - ) - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_size = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_size) - if self.num_shared_experts is not None: - shared_output = self.shared_experts(hidden_states) router_logits, _ = self.gate(hidden_states) - final_hidden_states = ( - self.experts(hidden_states=hidden_states, router_logits=router_logits) - * self.routed_scaling_factor + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits ) - if shared_output is not None: - final_hidden_states = final_hidden_states + shared_output - - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) return final_hidden_states.view(num_tokens, hidden_size) @@ -482,7 +476,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if self.config.is_moe: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( + expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( self, ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 
d955b7127adc..4b49430c1faf 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -150,7 +150,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, # needed for softmax score func @@ -161,6 +160,7 @@ def __init__( num_redundant_experts=self.n_redundant_experts, scoring_func="sigmoid", e_score_correction_bias=self.gate.e_score_correction_bias, + routed_scaling_factor=self.routed_scaling_factor, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -170,16 +170,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = ( - self.experts(hidden_states=hidden_states, router_logits=router_logits) - * self.routed_scaling_factor + final_hidden_states = self.experts( + hidden_states=hidden_states, router_logits=router_logits ) - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 - final_hidden_states - ) - return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index b84b4e2ae512..a1c0ac896052 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -135,7 +135,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): custom_routing_function=Llama4MoE.custom_routing_function, intermediate_size=intermediate_size_moe, apply_router_weight_on_input=True, - reduce_results=False, renormalize=False, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -151,19 +150,14 @@ def forward(self, hidden_states): router_logits, _ = self.router(hidden_states) - shared_out, routed_out = self.experts( + experts_out = self.experts( 
hidden_states=hidden_states, router_logits=router_logits, ) - experts_out = routed_out + shared_out if self.is_sequence_parallel: experts_out = tensor_model_parallel_all_gather(experts_out, 0) experts_out = experts_out[:num_tokens] - elif self.tp_size > 1: - experts_out = self.experts.maybe_all_reduce_tensor_model_parallel( - experts_out - ) return experts_out diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 2dc0f33cc1c5..1b068c8e4c6c 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -35,7 +35,6 @@ from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE @@ -103,7 +102,6 @@ def __init__( e_score_correction_bias=self.e_score_correction_bias, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - reduce_results=False, renormalize=True, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -133,9 +131,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - final_hidden_states = final_hidden_states - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 4ec794eccf72..52fdd09f1279 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -216,7 +216,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=self.moe_hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, @@ -231,6 +230,10 
@@ def __init__( num_redundant_experts=self.n_redundant_experts, is_sequence_parallel=self.is_sequence_parallel, routed_input_transform=self.fc1_latent_proj, + routed_output_transform=self.fc2_latent_proj + if self.use_latent_moe + else None, + routed_scaling_factor=self.routed_scaling_factor, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -243,38 +246,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - # SharedFusedMoE handles: - # - shared experts (with original hidden_states) - # - routed_input_transform (fc1_latent_proj) for latent MoE - # - multistream parallelism between shared and routed experts - shared_output, final_hidden_states = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - # Fix FP16 overflow - # See DeepseekV2DecoderLayer for more details. - if hidden_states.dtype != torch.float16: - final_hidden_states *= self.routed_scaling_factor - elif self.shared_experts is not None: - shared_output *= 1.0 / self.routed_scaling_factor - - # TODO: See SharedFusedMoE.apply_routed_input_transform - # for bandwidth optimization - if self.use_latent_moe: - final_hidden_states, _ = self.fc2_latent_proj(final_hidden_states) - - if self.shared_experts is not None: - final_hidden_states += shared_output - if self.is_sequence_parallel: final_hidden_states = tensor_model_parallel_all_gather( final_hidden_states, 0 ) final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 994ae82529ab..789e4260836f 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -206,7 
+206,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, @@ -214,8 +213,7 @@ def __init__( topk_group=1, prefix=f"{prefix}.experts", scoring_func="sigmoid", - # we do scaling outside, set factor to 1.0 to avoid double mul - routed_scaling_factor=1.0, + routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts, @@ -234,33 +232,15 @@ def forward( router_logits, _ = self.gate(hidden_states) - fused_moe_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - shared_output, final_hidden_states = fused_moe_out - if self.shared_experts is None: - assert shared_output is None - - if hidden_states.dtype != torch.float16: - final_hidden_states *= self.routed_scaling_factor - elif self.shared_experts is not None: - assert shared_output is not None - shared_output *= 1.0 / self.routed_scaling_factor - - if self.shared_experts is not None: - assert shared_output is not None - final_hidden_states += shared_output - if self.is_sequence_parallel: final_hidden_states = tensor_model_parallel_all_gather( final_hidden_states, 0 ) final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 4b0c756165a5..1234aad77be2 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -167,7 +167,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, 
intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -184,12 +183,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - if self.shared_expert is not None: - final_hidden_states = final_hidden_states[0] + final_hidden_states[1] - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 - final_hidden_states - ) return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f2ce070be8b4..f0f69d435379 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -212,7 +212,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -234,22 +233,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - shared_out, fused_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - final_hidden_states = ( - shared_out + fused_out if shared_out is not None else fused_out - ) if self.is_sequence_parallel: final_hidden_states = tensor_model_parallel_all_gather( final_hidden_states, 0 ) final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 - final_hidden_states - ) # return to 1d if input is 1d return final_hidden_states.squeeze(0) if is_input_1d else final_hidden_states diff --git a/vllm/model_executor/models/qwen3_next.py 
b/vllm/model_executor/models/qwen3_next.py index 5adccf178def..7e0544fda88d 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -328,7 +328,6 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=getattr(config, "norm_topk_prob", True), quant_config=quant_config, prefix=f"{prefix}.experts", @@ -358,18 +357,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states=hidden_states, router_logits=router_logits ) - if self.shared_expert is not None: - final_hidden_states = final_hidden_states[0] + final_hidden_states[1] - if self.is_sequence_parallel: final_hidden_states = tensor_model_parallel_all_gather( final_hidden_states, 0 ) final_hidden_states = final_hidden_states[:num_tokens] - elif self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 - final_hidden_states - ) return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 18b689166a5f..636a121c590b 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -14,7 +14,6 @@ from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul @@ -71,7 +70,6 @@ def __init__( top_k=config.moe_top_k, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_expert_weight, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -94,8 +92,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = self.experts( hidden_states=hidden_states, 
router_logits=router_logits ) - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/step3p5.py b/vllm/model_executor/models/step3p5.py index bb4bf14a9632..018f78956029 100644 --- a/vllm/model_executor/models/step3p5.py +++ b/vllm/model_executor/models/step3p5.py @@ -379,7 +379,6 @@ def __init__( top_k=config.moe_top_k, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_expert_weight, quant_config=quant_config, activation=activation, @@ -397,30 +396,16 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = hidden_states.view(-1, hidden_dim) if self.experts.is_internal_router: - # In this case, the gate/router runs inside the FusedMoE class - fused_moe_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=hidden_states ) else: - # router_logits: (num_tokens, n_experts) + # TODO(bnell): this gate could be moved into the FusedMoE? 
router_logits, _ = self.gate(hidden_states) - fused_moe_out = self.experts( + final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits ) - shared_output, final_hidden_states = fused_moe_out - if self.share_expert is None: - assert shared_output is None - - if self.share_expert is not None: - assert shared_output is not None - final_hidden_states += shared_output - - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( - final_hidden_states - ) - return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index f65a197abcfc..a654adcc289d 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -204,28 +204,6 @@ def recursive_replace(self): ) assert intermediate_size is not None - # If there are shared experts, the results are - # reduced after mlp.forward() not inside FusedMoE - num_shared_experts = getattr_iter( - text_config, - [ - "n_shared_experts", # DeepSeek, Docs, GLM - "moe_num_shared_experts", # Aria, Ernie - ], - 0, - ) - reduce_results = num_shared_experts == 0 - - def add_all_reduce(mlp: nn.Module): - """Adds an all-reduce to the output of `mlp.forward()`.""" - - class MLPWithAllReduce(mlp.__class__): - def forward(self, *args, **kwargs): - output = super().forward(*args, **kwargs) - return self.experts.maybe_all_reduce_tensor_model_parallel(output) - - mlp.__class__ = MLPWithAllReduce - # Unused kwargs since we use custom_routing_function: # - `scoring_func` and `e_score_correction_bias` only used for grouped # topk routing inside vLLM and are non-trivial to infer @@ -265,7 +243,7 @@ def forward(self, *args, **kwargs): self.num_physical_experts = num_experts + num_redundant_experts self.num_local_physical_experts = self.num_physical_experts // ep_size self.num_routed_experts = num_experts - 
self.num_shared_experts = num_shared_experts + self.num_shared_experts = 0 self.num_redundant_experts = num_redundant_experts # Recursively fuse MoE layers @@ -289,14 +267,10 @@ def _recursive_replace(module: nn.Module, prefix: str): if "bias" in experts_param_name: has_bias = True break - # Double check there are no shared experts - nonlocal reduce_results - if reduce_results: + # Detect shared experts if config doesn't specify + if self.num_shared_experts == 0: for mlp_param_name, _ in mlp.named_parameters(): if "shared_expert" in mlp_param_name: - reduce_results = False - # If the config does not specify num_shared_experts, but - # the model has shared experts, we assume there is one. self.num_shared_experts = 1 break # Replace experts module with FusedMoE @@ -305,7 +279,7 @@ def _recursive_replace(module: nn.Module, prefix: str): top_k=top_k, hidden_size=hidden_size, intermediate_size=intermediate_size, - reduce_results=reduce_results, + reduce_results=True, renormalize=renormalize, # Hard coded because topk happens in Transformers use_grouped_topk=False, @@ -326,13 +300,6 @@ def _recursive_replace(module: nn.Module, prefix: str): self.moe_layers.append(fused_experts) self.expert_weights.append(fused_experts.get_expert_weights()) self.num_moe_layers += 1 - # If results are not all-reduced in FusedMoE, ensure they - # are all-reduced at the end of mlp.forward() if tensor - # parallel or expert parallel is enabled - if not reduce_results and ( - fused_experts.tp_size > 1 or fused_experts.ep_size > 1 - ): - add_all_reduce(mlp) else: _recursive_replace(child_module, prefix=qual_name) From b4fd10790b8e5e0af5d77594e601cc1f52a0e3db Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 3 Mar 2026 18:58:54 -0500 Subject: [PATCH 051/191] cleanups Signed-off-by: Bill Nell --- vllm/forward_context.py | 2 +- .../fused_moe/runner/moe_runner_base.py | 98 ++++++++++++------- 2 files changed, 64 insertions(+), 36 deletions(-) diff --git a/vllm/forward_context.py 
b/vllm/forward_context.py index c1ead5e2034f..a7aaeff4fc85 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -211,7 +211,7 @@ class ForwardContext: # For torch.compile cold start times, we need to avoid hard-coding # any strings into the graph. Right now, the vllm.moe_forward - # custom operator hard-codes strings into + # and vllm.moe_forward_shared custom operators hard-code strings into # the graph. # # The workaround is to store a list of the strings that each of those diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 9eaddcdff9f5..1f277a8007cf 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -54,7 +54,7 @@ def get_layer_from_name(layer_name: str) -> torch.nn.Module: raise AssertionError( "We expected the number of MOE layers in `all_moe_layers` " "to be equal to the number of " - "vllm.moe_forward calls." + "{vllm.moe_forward, vllm.moe_forward_shared} calls." ) layer_name = all_moe_layers[moe_layer_index] forward_context.moe_layer_index += 1 @@ -99,13 +99,45 @@ def _moe_forward_fake( shared_experts_input: torch.Tensor | None, layer_name: _layer_name_type, ) -> torch.Tensor: - # For latent MoE with reduce_results=True: output has full hidden_size - # (from shared_experts_input), not latent_size (from hidden_states). 
- if shared_experts_input is not None: - return torch.empty_like(shared_experts_input) return torch.empty_like(hidden_states) +def _moe_forward_shared( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + layer = get_layer_from_name(layer_name) + return layer.runner.forward_dispatch( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) + + +def _moe_forward_shared_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + # Output shapes: + # - fused_out: same as hidden_states (routed experts use transformed size) + # - shared_out: same as shared_experts_input if provided, else same as + # hidden_states + # (For latent MoE: shared experts use original hidden_size, not latent size) + fused_out = torch.empty_like(hidden_states) + + if shared_experts_input is not None: + shared_out = torch.empty_like(shared_experts_input) + else: + shared_out = torch.empty_like(hidden_states) + + return shared_out, fused_out + + def _moe_forward_shared( hidden_states: torch.Tensor, router_logits: torch.Tensor, @@ -241,7 +273,9 @@ def _must_reduce_shared_expert_outputs(self) -> bool: and self.quant_method.moe_kernel.output_is_reduced() ) - def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor): + def _maybe_all_reduce_tensor_model_parallel( + self, final_hidden_states: torch.Tensor + ): """ Some combine kernels reduce across GPU ranks by default. """ @@ -259,10 +293,6 @@ def apply_routed_input_transform( is saved separately so shared experts get [S, hidden_size] while routed experts get the transformed [S, moe_latent_size]. - TODO: For latent MoE bandwidth optimization, fc2_latent_proj could be - moved inside FusedMoE to all-reduce on the smaller latent - dimension. 
- Returns (possibly transformed) hidden states and the input for shared experts (or None if there are no shared experts). """ @@ -282,10 +312,9 @@ def apply_routed_input_transform( def _combine_and_reduce( self, states: torch.Tensor, - trunc_sizes: list[int], + trunc_size: int, ) -> torch.Tensor: - assert len(trunc_sizes) == 1 - result = states[..., : trunc_sizes[0]] + result = states[..., :trunc_size] if self.output_scale is not None: result = result * self.output_scale @@ -293,7 +322,7 @@ def _combine_and_reduce( if not self.moe_config.is_sequence_parallel and ( self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1 ): - result = self.maybe_all_reduce_tensor_model_parallel(result) + result = self._maybe_all_reduce_tensor_model_parallel(result) return result @@ -312,7 +341,7 @@ def _maybe_pad_hidden_states( self, shared_experts_input: torch.Tensor | None, hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, list[int]]: + ) -> tuple[torch.Tensor, int]: shared_experts_hidden_dim = ( shared_experts_input.shape[-1] if shared_experts_input is not None else 0 ) @@ -329,9 +358,9 @@ def _maybe_pad_hidden_states( ) if self.routed_output_transform is not None and shared_experts_hidden_dim > 0: - orig_hidden_dims = [shared_experts_hidden_dim] + orig_hidden_dims = shared_experts_hidden_dim else: - orig_hidden_dims = [transformed_hidden_dim] + orig_hidden_dims = transformed_hidden_dim return hidden_states, orig_hidden_dims @@ -460,19 +489,31 @@ def forward( hidden_states ) - hidden_states, og_hidden_dims = self._maybe_pad_hidden_states( + hidden_states, og_hidden_dim = self._maybe_pad_hidden_states( shared_experts_input, hidden_states, ) - fused_output = self.forward_entry( + result = self.forward_entry( hidden_states, router_logits, shared_experts_input, self._encode_layer_name(), ) - result = self._combine_and_reduce(fused_output, og_hidden_dims) + if self.shared_experts is not None: + assert isinstance(result, tuple) + shared_output, fused_output = result + # Apply 
output transform (e.g. latent → full dim) + if self.routed_output_transform is not None: + r = self.routed_output_transform(fused_output) + fused_output = r[0] if isinstance(r, tuple) else r + # If combine kernel already reduced fused, reduce shared to match + if self._must_reduce_shared_expert_outputs(): + shared_output = tensor_model_parallel_all_reduce(shared_output) + result = shared_output + fused_output + + result = self._combine_and_reduce(result, og_hidden_dim) return self._maybe_add_zero_expert_output(result) @@ -482,7 +523,7 @@ def forward_dispatch( hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor: + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() @@ -498,26 +539,13 @@ def forward_dispatch( ) with self._sequence_parallel_context(): - result = self._forward_impl( + return self._forward_impl( layer, hidden_states, router_logits, shared_experts_input, ) - if isinstance(result, tuple): - shared_output, fused_output = result - # Apply output transform (e.g. 
latent → full dim) - if self.routed_output_transform is not None: - r = self.routed_output_transform(fused_output) - fused_output = r[0] if isinstance(r, tuple) else r - # If combine kernel already reduced fused, reduce shared to match - if self._must_reduce_shared_expert_outputs(): - shared_output = tensor_model_parallel_all_reduce(shared_output) - result = shared_output + fused_output - - return result - @abstractmethod def _forward_impl( self, From 4f0580283f90e5c0dc163b2d2a24d96e8d106459 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 3 Mar 2026 21:59:30 -0500 Subject: [PATCH 052/191] apply_scale_to_output flag Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 7 ++++--- .../layers/fused_moe/runner/default_moe_runner.py | 6 ++++-- .../layers/fused_moe/runner/moe_runner_base.py | 12 +++++++----- .../layers/fused_moe/runner/moe_runner_factory.py | 6 ++++-- vllm/model_executor/models/deepseek_v2.py | 7 ++++++- vllm/model_executor/models/dots1.py | 2 +- vllm/model_executor/models/nemotron_h.py | 2 +- vllm/model_executor/models/openpangu.py | 4 +++- 8 files changed, 30 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9cb82780ac22..57e7dc436874 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -316,14 +316,14 @@ def __init__( shared_experts: torch.nn.Module | None = None, routed_input_transform: torch.nn.Module | None = None, routed_output_transform: torch.nn.Module | None = None, - output_scale: float | None = None, + apply_scale_to_output: bool = False, zero_expert_type: str | None = None, ): super().__init__() self._routed_input_transform = routed_input_transform self._routed_output_transform = routed_output_transform - self._output_scale = output_scale + self._apply_scale_to_output = apply_scale_to_output if params_dtype is None: params_dtype = torch.get_default_dtype() @@ 
-686,7 +686,8 @@ def _init_runner(self) -> MoERunner: quant_method=self.quant_method, enable_dbo=self.vllm_config.parallel_config.enable_dbo, routed_output_transform=self._routed_output_transform, - output_scale=self._output_scale, + apply_scale_to_output=self._apply_scale_to_output, + routed_scaling_factor=self.routed_scaling_factor, ) # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 523a5362da51..b6441ae38e5a 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -63,7 +63,8 @@ def __init__( quant_method: FusedMoEMethodBase, enable_dbo: bool, routed_output_transform: torch.nn.Module | None = None, - output_scale: float | None = None, + apply_scale_to_output: bool = False, + routed_scaling_factor: float = 1.0, ): super().__init__( layer, @@ -75,7 +76,8 @@ def __init__( quant_method, enable_dbo, routed_output_transform=routed_output_transform, - output_scale=output_scale, + apply_scale_to_output=apply_scale_to_output, + routed_scaling_factor=routed_scaling_factor, ) @property diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 1f277a8007cf..0fd51e49c517 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -220,7 +220,8 @@ def __init__( quant_method: FusedMoEMethodBase, enable_dbo: bool, routed_output_transform: torch.nn.Module | None = None, - output_scale: float | None = None, + apply_scale_to_output: bool = False, + routed_scaling_factor: float = 1.0, ): super().__init__() self.moe_config = moe_config @@ -232,7 +233,8 @@ def __init__( self.enable_dbo = enable_dbo self.enable_eplb = 
moe_config.moe_parallel_config.enable_eplb self.routed_output_transform = routed_output_transform - self.output_scale = output_scale + self.apply_scale_to_output = apply_scale_to_output + self.routed_scaling_factor = routed_scaling_factor # Needed for string -> FusedMoE layer lookup in custom ops. self.layer_name = layer.layer_name @@ -316,9 +318,6 @@ def _combine_and_reduce( ) -> torch.Tensor: result = states[..., :trunc_size] - if self.output_scale is not None: - result = result * self.output_scale - if not self.moe_config.is_sequence_parallel and ( self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1 ): @@ -513,6 +512,9 @@ def forward( shared_output = tensor_model_parallel_all_reduce(shared_output) result = shared_output + fused_output + if self.apply_scale_to_output: + result = result * self.routed_scaling_factor + result = self._combine_and_reduce(result, og_hidden_dim) return self._maybe_add_zero_expert_output(result) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py index e6100814cad8..c9a7efc431c3 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py @@ -34,7 +34,8 @@ def create_moe_runner( quant_method: FusedMoEMethodBase, enable_dbo: bool, routed_output_transform: torch.nn.Module | None = None, - output_scale: float | None = None, + apply_scale_to_output: bool = False, + routed_scaling_factor: float = 1.0, ) -> MoERunner: runner = DefaultMoERunner( layer, @@ -46,7 +47,8 @@ def create_moe_runner( quant_method, enable_dbo, routed_output_transform=routed_output_transform, - output_scale=output_scale, + apply_scale_to_output=apply_scale_to_output, + routed_scaling_factor=routed_scaling_factor, ) if moe_config.moe_parallel_config.use_dp_chunking: return ChunkingMoERunner(runner) diff --git a/vllm/model_executor/models/deepseek_v2.py 
b/vllm/model_executor/models/deepseek_v2.py index 1a64299789fb..95b4525cfa3a 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -319,7 +319,12 @@ def __init__( topk_group=getattr(config, "topk_group", 1), prefix=f"{prefix}.experts", scoring_func=getattr(config, "scoring_func", "softmax"), - routed_scaling_factor=self.routed_scaling_factor, + # we do scaling outside, set factor to 1.0 to avoid double mul + # aiter applies routed_scaling_factor internally + routed_scaling_factor=1.0 + if not self.is_rocm_aiter_moe_enabled + else self.routed_scaling_factor, + apply_scale_to_output=not self.is_rocm_aiter_moe_enabled, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index fc2636db48ab..fa867c5893fb 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -171,7 +171,7 @@ def __init__( # we do scaling outside, set factor to 1.0 to avoid double mul routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias, - output_scale=self.routed_scaling_factor, + apply_scale_to_output=True, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 52fdd09f1279..ecca590d56c3 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -233,7 +233,7 @@ def __init__( routed_output_transform=self.fc2_latent_proj if self.use_latent_moe else None, - routed_scaling_factor=self.routed_scaling_factor, + apply_scale_to_output=True, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 789e4260836f..0d7e0f544a21 100644 --- 
a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -213,11 +213,13 @@ def __init__( topk_group=1, prefix=f"{prefix}.experts", scoring_func="sigmoid", - routed_scaling_factor=self.routed_scaling_factor, + # we do scaling outside, set factor to 1.0 to avoid double mul + routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts, is_sequence_parallel=self.is_sequence_parallel, + apply_scale_to_output=True, ) def forward( From 72c0b5e5cd577217064cfb98d384e2b0de7fe7b9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 3 Mar 2026 22:36:32 -0500 Subject: [PATCH 053/191] fix fp16 scaling factor stuff Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 4 +- .../fused_moe/runner/moe_runner_base.py | 50 ++++++++++++++----- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/dots1.py | 3 +- vllm/model_executor/models/nemotron_h.py | 1 + vllm/model_executor/models/openpangu.py | 5 +- 6 files changed, 45 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 57e7dc436874..6b63f7a575e8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -500,7 +500,9 @@ def __init__( topk_group=topk_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, + routed_scaling_factor=routed_scaling_factor + if not apply_scale_to_output + else 1.0, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=self.num_fused_shared_experts, enable_eplb=enable_eplb, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 0fd51e49c517..238ca64b7d78 100644 --- 
a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -233,7 +233,9 @@ def __init__( self.enable_dbo = enable_dbo self.enable_eplb = moe_config.moe_parallel_config.enable_eplb self.routed_output_transform = routed_output_transform - self.apply_scale_to_output = apply_scale_to_output + self.apply_scale_to_output = ( + apply_scale_to_output and routed_scaling_factor != 1.0 + ) self.routed_scaling_factor = routed_scaling_factor # Needed for string -> FusedMoE layer lookup in custom ops. @@ -271,7 +273,8 @@ def _must_reduce_shared_expert_outputs(self) -> bool: early. """ return ( - self.quant_method.moe_kernel is not None + self.shared_experts is not None + and self.quant_method.moe_kernel is not None and self.quant_method.moe_kernel.output_is_reduced() ) @@ -311,7 +314,7 @@ def apply_routed_input_transform( hidden_states if self.shared_experts is not None else None, ) - def _combine_and_reduce( + def _maybe_reduce_output( self, states: torch.Tensor, trunc_size: int, @@ -500,22 +503,43 @@ def forward( self._encode_layer_name(), ) + ############################################################## + + # Extract outputs from result if self.shared_experts is not None: assert isinstance(result, tuple) shared_output, fused_output = result - # Apply output transform (e.g. latent → full dim) - if self.routed_output_transform is not None: - r = self.routed_output_transform(fused_output) - fused_output = r[0] if isinstance(r, tuple) else r - # If combine kernel already reduced fused, reduce shared to match - if self._must_reduce_shared_expert_outputs(): - shared_output = tensor_model_parallel_all_reduce(shared_output) - result = shared_output + fused_output + else: + shared_output = None + fused_output = result + + # Apply output transform (e.g. 
latent → full dim) + if self.routed_output_transform is not None: + r = self.routed_output_transform(fused_output) + fused_output = r[0] if isinstance(r, tuple) else r + + # If combine kernel already reduced fused, reduce shared to match + if self._must_reduce_shared_expert_outputs(): + assert shared_output is not None + shared_output = tensor_model_parallel_all_reduce(shared_output) if self.apply_scale_to_output: - result = result * self.routed_scaling_factor + # FP16 overflow protection: instead of (shared + fused) * scale, + # compute shared/scale + fused. The decoder layer compensates + # with matching divisions (see DeepseekV2DecoderLayer). + if fused_output.dtype != torch.float16: + fused_output *= self.routed_scaling_factor + elif shared_output is not None: + shared_output *= 1.0 / self.routed_scaling_factor + + if shared_output is not None: + result = shared_output + fused_output + else: + result = fused_output + + ############################################################## - result = self._combine_and_reduce(result, og_hidden_dim) + result = self._maybe_reduce_output(result, og_hidden_dim) return self._maybe_add_zero_expert_output(result) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 95b4525cfa3a..c0d5849a2921 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -321,9 +321,7 @@ def __init__( scoring_func=getattr(config, "scoring_func", "softmax"), # we do scaling outside, set factor to 1.0 to avoid double mul # aiter applies routed_scaling_factor internally - routed_scaling_factor=1.0 - if not self.is_rocm_aiter_moe_enabled - else self.routed_scaling_factor, + routed_scaling_factor=self.routed_scaling_factor, apply_scale_to_output=not self.is_rocm_aiter_moe_enabled, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py 
index fa867c5893fb..f0bbde4bcdbe 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -168,9 +168,8 @@ def __init__( topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func=config.scoring_func, - # we do scaling outside, set factor to 1.0 to avoid double mul - routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias, + routed_scaling_factor=self.routed_scaling_factor, apply_scale_to_output=True, ) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index ecca590d56c3..8e313226120a 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -233,6 +233,7 @@ def __init__( routed_output_transform=self.fc2_latent_proj if self.use_latent_moe else None, + routed_scaling_factor=self.routed_scaling_factor, apply_scale_to_output=True, ) diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 0d7e0f544a21..409370a77089 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -213,13 +213,12 @@ def __init__( topk_group=1, prefix=f"{prefix}.experts", scoring_func="sigmoid", - # we do scaling outside, set factor to 1.0 to avoid double mul - routed_scaling_factor=1.0, + routed_scaling_factor=self.routed_scaling_factor, + apply_scale_to_output=True, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts, is_sequence_parallel=self.is_sequence_parallel, - apply_scale_to_output=True, ) def forward( From 3239666dda8ef7a03bd2137a507c4cfe6161ce0b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 4 Mar 2026 09:48:44 -0500 Subject: [PATCH 054/191] cleanups Signed-off-by: Bill Nell --- .../fused_moe/runner/moe_runner_base.py | 139 +++++++++++------- vllm/model_executor/models/flex_olmo.py | 1 - vllm/model_executor/models/gpt_oss.py | 1 - 
vllm/model_executor/models/granitemoe.py | 1 - vllm/model_executor/models/grok1.py | 1 - vllm/model_executor/models/interns1_pro.py | 1 - vllm/model_executor/models/jamba.py | 1 - vllm/model_executor/models/longcat_flash.py | 1 - vllm/model_executor/models/mimo_v2_flash.py | 1 - vllm/model_executor/models/minimax_text_01.py | 1 - vllm/model_executor/models/mixtral.py | 1 - vllm/model_executor/models/olmoe.py | 1 - vllm/model_executor/models/phimoe.py | 1 - .../model_executor/models/transformers/moe.py | 1 - 14 files changed, 82 insertions(+), 70 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 238ca64b7d78..dae8d41bae89 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -189,6 +189,15 @@ def _moe_forward_shared_fake( ) +def _unpack( + result: torch.Tensor | tuple[torch.Tensor, torch.Tensor], +) -> tuple[torch.Tensor | None, torch.Tensor]: + if isinstance(result, tuple): + return result + else: + return (None, result) + + class MoERunnerBase(MoERunner): """ Abstract base class providing common functionality for MoE runner implementations. @@ -259,36 +268,6 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: def is_internal_router(self) -> bool: return self.gate is not None - def _must_reduce_shared_expert_outputs(self) -> bool: - """ - The shared_experts are typically computed using the RowParallelLinear - layer. The result of this function is typically used as - the reduce_results argument to the module. - When just tensor-parallel is used, it is not required to reduce - the shared_experts results immediately. Instead we reduce at the - once at the end of the MoE op. (Refer to DeepSeekV2MoE module) - With EP and all2all kernels - this is no longer viable as all - GPU ranks in DP, produce the complete set of hidden_states. 
- Therefore it is required that we reduce the shared_experts output - early. - """ - return ( - self.shared_experts is not None - and self.quant_method.moe_kernel is not None - and self.quant_method.moe_kernel.output_is_reduced() - ) - - def _maybe_all_reduce_tensor_model_parallel( - self, final_hidden_states: torch.Tensor - ): - """ - Some combine kernels reduce across GPU ranks by default. - """ - if self._must_reduce_shared_expert_outputs(): - return final_hidden_states - else: - return tensor_model_parallel_all_reduce(final_hidden_states) - def apply_routed_input_transform( self, hidden_states: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor | None]: @@ -314,6 +293,58 @@ def apply_routed_input_transform( hidden_states if self.shared_experts is not None else None, ) + def apply_routed_output_transform( + self, + fused_output: torch.Tensor, + ) -> torch.Tensor: + if self.routed_output_transform is not None: + r = self.routed_output_transform(fused_output) + fused_output = r[0] if isinstance(r, tuple) else r + return fused_output + + def _maybe_apply_output_scale( + self, + shared_output: torch.Tensor | None, + fused_output: torch.Tensor, + ) -> tuple[torch.Tensor | None, torch.Tensor]: + if self.apply_scale_to_output: + # FP16 overflow protection: instead of (shared + fused) * scale, + # compute shared/scale + fused. The decoder layer compensates + # with matching divisions (see DeepseekV2DecoderLayer). + if fused_output.dtype != torch.float16: + fused_output *= self.routed_scaling_factor + elif shared_output is not None: + shared_output *= 1.0 / self.routed_scaling_factor + return shared_output, fused_output + + def _must_reduce_shared_expert_output(self) -> bool: + """ + The shared_experts are typically computed using the RowParallelLinear + layer. The result of this function is typically used as + the reduce_results argument to the module. + When just tensor-parallel is used, it is not required to reduce + the shared_experts results immediately. 
Instead we reduce at the + once at the end of the MoE op. (Refer to DeepSeekV2MoE module) + With EP and all2all kernels - this is no longer viable as all + GPU ranks in DP, produce the complete set of hidden_states. + Therefore it is required that we reduce the shared_experts output + early. + """ + return ( + self.shared_experts is not None + and self.quant_method.moe_mk is not None + and self.quant_method.moe_mk.output_is_reduced() + ) + + def _maybe_reduce_shared_expert_output( + self, + shared_output: torch.Tensor | None, + ) -> torch.Tensor | None: + if self._must_reduce_shared_expert_output(): + assert shared_output is not None + shared_output = tensor_model_parallel_all_reduce(shared_output) + return shared_output + def _maybe_reduce_output( self, states: torch.Tensor, @@ -321,10 +352,12 @@ def _maybe_reduce_output( ) -> torch.Tensor: result = states[..., :trunc_size] - if not self.moe_config.is_sequence_parallel and ( - self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1 + if ( + not self.moe_config.is_sequence_parallel + and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) + and not self._must_reduce_shared_expert_output() ): - result = self._maybe_all_reduce_tensor_model_parallel(result) + result = tensor_model_parallel_all_reduce(result) return result @@ -503,42 +536,34 @@ def forward( self._encode_layer_name(), ) - ############################################################## + # + # Note: there are two all-reduce points below. They are mutually + # exclusive, controlled by _must_reduce_shared_expert_output(): + # - When True: the combine kernel already reduced fused_output, + # so we reduce shared_output here to match, then skip the + # all-reduce in _maybe_reduce_output. + # - When False: neither output is reduced yet, so we combine + # them first and all-reduce the sum in _maybe_reduce_output. 
# Extract outputs from result - if self.shared_experts is not None: - assert isinstance(result, tuple) - shared_output, fused_output = result - else: - shared_output = None - fused_output = result + shared_output, fused_output = _unpack(result) # Apply output transform (e.g. latent → full dim) - if self.routed_output_transform is not None: - r = self.routed_output_transform(fused_output) - fused_output = r[0] if isinstance(r, tuple) else r + fused_output = self.apply_routed_output_transform(fused_output) - # If combine kernel already reduced fused, reduce shared to match - if self._must_reduce_shared_expert_outputs(): - assert shared_output is not None - shared_output = tensor_model_parallel_all_reduce(shared_output) + # If combine kernel already reduced fused, reduce shared to match. + # See note above re: the two all-reduce points. + shared_output = self._maybe_reduce_shared_expert_output(shared_output) - if self.apply_scale_to_output: - # FP16 overflow protection: instead of (shared + fused) * scale, - # compute shared/scale + fused. The decoder layer compensates - # with matching divisions (see DeepseekV2DecoderLayer). 
- if fused_output.dtype != torch.float16: - fused_output *= self.routed_scaling_factor - elif shared_output is not None: - shared_output *= 1.0 / self.routed_scaling_factor + shared_output, fused_output = self._maybe_apply_output_scale( + shared_output, fused_output + ) if shared_output is not None: result = shared_output + fused_output else: result = fused_output - ############################################################## - result = self._maybe_reduce_output(result, og_hidden_dim) return self._maybe_add_zero_expert_output(result) diff --git a/vllm/model_executor/models/flex_olmo.py b/vllm/model_executor/models/flex_olmo.py index 67be99a879ff..1b2047eb231f 100644 --- a/vllm/model_executor/models/flex_olmo.py +++ b/vllm/model_executor/models/flex_olmo.py @@ -76,7 +76,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): top_k=hf_config.num_experts_per_tok, hidden_size=hf_config.hidden_size, intermediate_size=hf_config.intermediate_size, - reduce_results=True, renormalize=False, quant_config=None, tp_size=tp_size, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 482056250a1e..083defc9c2db 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -186,7 +186,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - reduce_results=True, renormalize=True, quant_config=quant_config, prefix=f"{prefix}.experts", diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 171b2e0ec5a0..f57a8c942bb4 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -104,7 +104,6 @@ def __init__( hidden_size=hidden_size, intermediate_size=intermediate_size, params_dtype=params_dtype, - reduce_results=True, renormalize=True, quant_config=quant_config, tp_size=tp_size, diff --git a/vllm/model_executor/models/grok1.py 
b/vllm/model_executor/models/grok1.py index 0bd6a8f3d606..c9aa3d2068f0 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -209,7 +209,6 @@ def __init__( hidden_size=hidden_size, intermediate_size=intermediate_size, params_dtype=params_dtype, - reduce_results=True, renormalize=renormalize, quant_config=quant_config, tp_size=tp_size, diff --git a/vllm/model_executor/models/interns1_pro.py b/vllm/model_executor/models/interns1_pro.py index 28331b8ef3e8..9612ea57b2cb 100644 --- a/vllm/model_executor/models/interns1_pro.py +++ b/vllm/model_executor/models/interns1_pro.py @@ -176,7 +176,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=True, renormalize=config.norm_topk_prob, quant_config=quant_config, prefix=f"{prefix}.experts", diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 980bcffb5f9b..b4b3b6873db3 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -90,7 +90,6 @@ def __init__( self.intermediate_size, tp_size=tp_size, params_dtype=params_dtype, - reduce_results=True, renormalize=False, use_grouped_topk=False, quant_config=quant_config, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index 375b0b69b1f9..945fcb61509b 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -300,7 +300,6 @@ def __init__( top_k=top_k, hidden_size=hidden_size, intermediate_size=intermediate_size, - reduce_results=True, params_dtype=params_dtype, renormalize=False, quant_config=quant_config, diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py index 43475ed690c9..0b466f16601a 100644 --- a/vllm/model_executor/models/mimo_v2_flash.py +++ b/vllm/model_executor/models/mimo_v2_flash.py @@ -162,7 +162,6 @@ def 
__init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=True, renormalize=config.norm_topk_prob, quant_config=quant_config, prefix=f"{prefix}.experts", diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 21d74d8b0580..67d7cb2d8bcb 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -162,7 +162,6 @@ def __init__( hidden_size=self.hidden_size, intermediate_size=self.intermediate_size * self.tp_size, params_dtype=self.params_dtype, - reduce_results=True, renormalize=True, quant_config=self.quant_config, tp_size=self.tp_size, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 376fd7a1709d..c182444f667d 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -132,7 +132,6 @@ def __init__( hidden_size=hidden_size, intermediate_size=intermediate_size, params_dtype=params_dtype, - reduce_results=True, renormalize=True, quant_config=quant_config, tp_size=tp_size, diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index f0afe0e997cc..fcde2e41afbb 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -98,7 +98,6 @@ def __init__( top_k=top_k, hidden_size=hidden_size, intermediate_size=intermediate_size, - reduce_results=True, renormalize=False, quant_config=quant_config, tp_size=tp_size, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 0b55b7ec8392..7d6083f202e6 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -281,7 +281,6 @@ def __init__( hidden_size=hidden_size, intermediate_size=intermediate_size, params_dtype=params_dtype, - reduce_results=True, renormalize=False, quant_config=quant_config, tp_size=tp_size, diff 
--git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index a654adcc289d..faafc5037e29 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -279,7 +279,6 @@ def _recursive_replace(module: nn.Module, prefix: str): top_k=top_k, hidden_size=hidden_size, intermediate_size=intermediate_size, - reduce_results=True, renormalize=renormalize, # Hard coded because topk happens in Transformers use_grouped_topk=False, From 3a6cfd7165b3e3706d5c301fa336772a3a56882a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 4 Mar 2026 10:02:31 -0500 Subject: [PATCH 055/191] add claude generated comments Signed-off-by: Bill Nell --- .../fused_moe/runner/moe_runner_base.py | 112 ++++++++++++++++-- 1 file changed, 99 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index dae8d41bae89..b3d60b871e89 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -77,7 +77,7 @@ def _resolve_layer_name(layer_name: str | ModuleName) -> str: # Note: _moe_forward and _moe_forward_shared should not contain any # implementation details, They should merely pass along control to -# the runner's 'forward_dispatch' method. +# the runner's '_forward_dispatch' method. 
def _moe_forward( hidden_states: torch.Tensor, router_logits: torch.Tensor, @@ -85,7 +85,7 @@ def _moe_forward( layer_name: _layer_name_type, ) -> torch.Tensor: layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer.runner.forward_dispatch( + return layer.runner._forward_dispatch( layer, hidden_states, router_logits, @@ -109,7 +109,7 @@ def _moe_forward_shared( layer_name: str, ) -> tuple[torch.Tensor, torch.Tensor]: layer = get_layer_from_name(layer_name) - return layer.runner.forward_dispatch( + return layer.runner._forward_dispatch( layer, hidden_states, router_logits, @@ -297,6 +297,12 @@ def apply_routed_output_transform( self, fused_output: torch.Tensor, ) -> torch.Tensor: + """Apply transform to routed expert output (e.g., latent to full dim). + + Used by latent MoE models (e.g., NemotronH) where routed experts + operate in a compressed latent space and need projection back to + the full hidden dimension before combining with shared expert output. + """ if self.routed_output_transform is not None: r = self.routed_output_transform(fused_output) fused_output = r[0] if isinstance(r, tuple) else r @@ -307,10 +313,15 @@ def _maybe_apply_output_scale( shared_output: torch.Tensor | None, fused_output: torch.Tensor, ) -> tuple[torch.Tensor | None, torch.Tensor]: + """Apply routed_scaling_factor to the output with FP16 overflow + protection. + + When apply_scale_to_output is True, scales the fused expert output + by routed_scaling_factor. For FP16, avoids overflow by dividing + shared_output by the scale instead (the decoder layer compensates + with matching divisions). + """ if self.apply_scale_to_output: - # FP16 overflow protection: instead of (shared + fused) * scale, - # compute shared/scale + fused. The decoder layer compensates - # with matching divisions (see DeepseekV2DecoderLayer). 
if fused_output.dtype != torch.float16: fused_output *= self.routed_scaling_factor elif shared_output is not None: @@ -340,6 +351,13 @@ def _maybe_reduce_shared_expert_output( self, shared_output: torch.Tensor | None, ) -> torch.Tensor | None: + """All-reduce shared expert output when the combine kernel already + reduced fused output. + + This is the "early" all-reduce path. When the combine kernel produces + already-reduced fused output, shared output must be reduced separately + to match. See _must_reduce_shared_expert_output for details. + """ if self._must_reduce_shared_expert_output(): assert shared_output is not None shared_output = tensor_model_parallel_all_reduce(shared_output) @@ -350,6 +368,13 @@ def _maybe_reduce_output( states: torch.Tensor, trunc_size: int, ) -> torch.Tensor: + """Truncate padded dimensions and all-reduce the combined output. + + This is the "late" all-reduce path. When neither fused nor shared + output was individually reduced, the combined sum is all-reduced + here. Skipped when sequence-parallel is active (SP handles its + own reduction) or when the early path already reduced both outputs. + """ result = states[..., :trunc_size] if ( @@ -362,6 +387,13 @@ def _maybe_reduce_output( return result def _encode_layer_name(self) -> str | ModuleName: + """Return the layer name string for custom op layer lookup. + + When torch.compile is active, returns "from_forward_context" so the + custom op resolves the layer via ForwardContext at runtime (avoiding + graph breaks). Falls back to the literal layer name for unit tests + or when ForwardContext is unavailable. + """ if HAS_OPAQUE_TYPE: return ModuleName(self.layer_name) # Can be unavailable or None in unittests @@ -377,6 +409,14 @@ def _maybe_pad_hidden_states( shared_experts_input: torch.Tensor | None, hidden_states: torch.Tensor, ) -> tuple[torch.Tensor, int]: + """Pad hidden_states to moe_config.hidden_dim and compute the + original dimension for later truncation. 
+ + For latent MoE, the routed hidden_states may be smaller than + hidden_dim. Padding ensures uniform tensor sizes through the + fused MoE kernel. The returned trunc_size is used by + _maybe_reduce_output to strip the padding from the result. + """ shared_experts_hidden_dim = ( shared_experts_input.shape[-1] if shared_experts_input is not None else 0 ) @@ -404,6 +444,13 @@ def _maybe_apply_shared_experts( shared_experts_input: torch.Tensor | None, order: SharedExpertsOrder, ): + """Trigger shared expert computation at the specified ordering point. + + Shared experts can run at different points relative to routed experts + (EXTERNAL, BEFORE_QUANT_METHOD, AFTER_QUANT_METHOD) depending on the + model's overlap strategy. Only fires if shared experts are configured + and the order matches the shared experts' configured execution point. + """ if self.shared_experts is not None: assert shared_experts_input is not None self.shared_experts.apply(shared_experts_input, order) @@ -415,6 +462,12 @@ def _apply_quant_method( router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, ) -> tuple[torch.Tensor | None, torch.Tensor]: + """Run expert routing and the fused MoE kernel via the quant method. + + Orchestrates shared expert execution (before/after), expert selection + via the router, and the actual fused MoE computation. Returns + (shared_expert_output, fused_expert_output). + """ # Run this before quant_method to avoid inplace issues. self._maybe_apply_shared_experts( shared_experts_input, @@ -452,6 +505,13 @@ def _apply_quant_method( ) def _sequence_parallel_context(self): + """Return a context manager for sequence-parallel token + redistribution. + + When sequence parallelism is active, returns a context that handles + local size tracking for proper token scatter/gather. Otherwise + returns a no-op context. 
+ """ ctx = get_forward_context() return ( ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size) @@ -465,6 +525,12 @@ def _maybe_overlap_gate_with_shared_experts( router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: + """Apply the gate module to compute router logits if provided. + + Used in overlapped mode where shared experts run in parallel with + routed experts on a separate CUDA stream. The gate is separated + from the router to allow this parallel execution. + """ # If router/gate provided, then apply it here. # (Note: This code runs only when "overlapped mode" is on to allow # parallel execution of shared experts with the FusedMoE via @@ -481,6 +547,12 @@ def _maybe_add_zero_expert_output( self, result: torch.Tensor, ) -> torch.Tensor: + """Add the zero expert's contribution to the final result. + + When a ZeroExpertRouter is used, it computes a bias-like output + from the "zero expert" that is added to the combined routed+shared + expert output. + """ if isinstance(self.router, ZeroExpertRouter): zero_expert_output = self.router.zero_expert_output assert zero_expert_output is not None @@ -504,24 +576,25 @@ def forward( Calling sequence - forward - self.forward_entry (_moe_forward or _moe_forward_shared custom op) - - forward_dispatch + - _forward_dispatch - _forward_impl Note: The existence of _moe_forward and _moe_forward_shared custom ops are due to the following reasons: 1. the chunking loop in ChunkingMoERunner._forward_impl cannot be compiled by torch.compile - 2. pytorch cannot handle union types in custom op signatures so _moe_forward - and _moe_forward_shared must be split. + 2. pytorch cannot handle union types in custom op signatures so + _moe_forward and _moe_forward_shared must be split. If ChunkingMoERunner._forward_impl can be implemented via torch.scan we can potentially get rid of _moe_forward and _moe_forward_shared and collapse the whole sequence into the 'forward' method. 
""" - # Apply transform for routed experts (e.g., latent projection for latent MoE) + # Apply transform for routed experts (e.g., latent projection + # for latent MoE) hidden_states, shared_experts_input = self.apply_routed_input_transform( - hidden_states + hidden_states, ) hidden_states, og_hidden_dim = self._maybe_pad_hidden_states( @@ -548,7 +621,7 @@ def forward( # Extract outputs from result shared_output, fused_output = _unpack(result) - # Apply output transform (e.g. latent → full dim) + # Apply output transform (e.g. latent -> full dim) fused_output = self.apply_routed_output_transform(fused_output) # If combine kernel already reduced fused, reduce shared to match. @@ -568,13 +641,19 @@ def forward( return self._maybe_add_zero_expert_output(result) - def forward_dispatch( + def _forward_dispatch( self, layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """Entry point called by the custom op to run the MoE computation. + + Handles pre-dispatch setup (gate application, external shared expert + triggering, quant config init) then delegates to _forward_impl within + the sequence-parallel context. + """ # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() @@ -605,4 +684,11 @@ def _forward_impl( router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """Core MoE computation to be implemented by subclasses. + + Performs expert routing, fused MoE kernel execution, and shared + expert computation. Returns a single tensor (fused output only) + or a tuple of (shared_output, fused_output) when shared experts + are present. 
+ """ raise NotImplementedError From 14a395a94485477722a5f74b12642659873343e7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 4 Mar 2026 10:45:52 -0500 Subject: [PATCH 056/191] move stuff out ot custom op Signed-off-by: Bill Nell --- .../fused_moe/runner/moe_runner_base.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index b3d60b871e89..55d4515852c3 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -78,6 +78,8 @@ def _resolve_layer_name(layer_name: str | ModuleName) -> str: # Note: _moe_forward and _moe_forward_shared should not contain any # implementation details, They should merely pass along control to # the runner's '_forward_dispatch' method. +# These functions should never be called directly since they do not +# include all the functionality of the MoE layer. def _moe_forward( hidden_states: torch.Tensor, router_logits: torch.Tensor, @@ -250,7 +252,7 @@ def __init__( # Needed for string -> FusedMoE layer lookup in custom ops. 
self.layer_name = layer.layer_name - self.forward_entry = self._select_forward(layer) + self._forward_entry = self._select_forward(layer) def _select_forward(self, layer: torch.nn.Module) -> Callable: if current_platform.is_tpu() or current_platform.is_cpu(): @@ -575,7 +577,7 @@ def forward( Calling sequence - forward - - self.forward_entry (_moe_forward or _moe_forward_shared custom op) + - self._forward_entry (_moe_forward or _moe_forward_shared custom op) - _forward_dispatch - _forward_impl @@ -602,7 +604,18 @@ def forward( hidden_states, ) - result = self.forward_entry( + router_logits = self._maybe_overlap_gate_with_shared_experts( + hidden_states, + router_logits, + shared_experts_input, + ) + + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.EXTERNAL, + ) + + result = self._forward_entry( hidden_states, router_logits, shared_experts_input, @@ -657,17 +670,6 @@ def _forward_dispatch( # TODO(bnell): this can be removed after MK migration is complete. layer.ensure_moe_quant_config_init() - router_logits = self._maybe_overlap_gate_with_shared_experts( - hidden_states, - router_logits, - shared_experts_input, - ) - - self._maybe_apply_shared_experts( - shared_experts_input, - SharedExpertsOrder.EXTERNAL, - ) - with self._sequence_parallel_context(): return self._forward_impl( layer, From 8beaca1b0693add9b084d31fc7808a5c3b2c7ea8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 4 Mar 2026 15:18:41 -0500 Subject: [PATCH 057/191] fix transformers/moe.py Signed-off-by: Bill Nell --- vllm/model_executor/models/transformers/moe.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index faafc5037e29..baacb528af0d 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -204,6 +204,15 @@ def recursive_replace(self): ) assert intermediate_size is not None 
+ num_shared_experts = getattr_iter( + text_config, + [ + "n_shared_experts", # DeepSeek, Docs, GLM + "moe_num_shared_experts", # Aria, Ernie + ], + 0, + ) + # Unused kwargs since we use custom_routing_function: # - `scoring_func` and `e_score_correction_bias` only used for grouped # topk routing inside vLLM and are non-trivial to infer @@ -243,7 +252,7 @@ def recursive_replace(self): self.num_physical_experts = num_experts + num_redundant_experts self.num_local_physical_experts = self.num_physical_experts // ep_size self.num_routed_experts = num_experts - self.num_shared_experts = 0 + self.num_shared_experts = num_shared_experts self.num_redundant_experts = num_redundant_experts # Recursively fuse MoE layers @@ -267,7 +276,8 @@ def _recursive_replace(module: nn.Module, prefix: str): if "bias" in experts_param_name: has_bias = True break - # Detect shared experts if config doesn't specify + # If the config does not specify num_shared_experts, but + # the model has shared experts, we assume there is one. if self.num_shared_experts == 0: for mlp_param_name, _ in mlp.named_parameters(): if "shared_expert" in mlp_param_name: From 33a010996f492bbb95be59005f93d4a9f3131e49 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 15:26:11 +0000 Subject: [PATCH 058/191] tweak op registration Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 55d4515852c3..54e41c8ba511 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -177,7 +177,7 @@ def _moe_forward_shared_fake( direct_register_custom_op( op_name="moe_forward", op_func=_moe_forward, - mutates_args=["hidden_states"], # is this still true? 
+ mutates_args=["hidden_states"], fake_impl=_moe_forward_fake, tags=(torch.Tag.needs_fixed_stride_order,), ) From 3c4ce4c2ff97f8af179ad9f96ec3e9f0a70f0b6b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 16:05:56 +0000 Subject: [PATCH 059/191] fix rebase Signed-off-by: Bill Nell --- .../fused_moe/runner/moe_runner_base.py | 38 +------------------ 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 54e41c8ba511..547147ba8794 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -104,42 +104,6 @@ def _moe_forward_fake( return torch.empty_like(hidden_states) -def _moe_forward_shared( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: str, -) -> tuple[torch.Tensor, torch.Tensor]: - layer = get_layer_from_name(layer_name) - return layer.runner._forward_dispatch( - layer, - hidden_states, - router_logits, - shared_experts_input, - ) - - -def _moe_forward_shared_fake( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: str, -) -> tuple[torch.Tensor, torch.Tensor]: - # Output shapes: - # - fused_out: same as hidden_states (routed experts use transformed size) - # - shared_out: same as shared_experts_input if provided, else same as - # hidden_states - # (For latent MoE: shared experts use original hidden_size, not latent size) - fused_out = torch.empty_like(hidden_states) - - if shared_experts_input is not None: - shared_out = torch.empty_like(shared_experts_input) - else: - shared_out = torch.empty_like(hidden_states) - - return shared_out, fused_out - - def _moe_forward_shared( hidden_states: torch.Tensor, router_logits: torch.Tensor, @@ -147,7 +111,7 @@ def _moe_forward_shared( 
layer_name: _layer_name_type, ) -> tuple[torch.Tensor, torch.Tensor]: layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer.runner.forward_dispatch( + return layer.runner._forward_dispatch( layer, hidden_states, router_logits, From 47a22603836bc97b8a2017aa0abe49afc42cfe02 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 18:02:35 +0000 Subject: [PATCH 060/191] fix merge Signed-off-by: Bill Nell --- .../model_executor/layers/fused_moe/runner/moe_runner_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 547147ba8794..ba7ba5806615 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -309,8 +309,8 @@ def _must_reduce_shared_expert_output(self) -> bool: """ return ( self.shared_experts is not None - and self.quant_method.moe_mk is not None - and self.quant_method.moe_mk.output_is_reduced() + and self.quant_method.moe_kernel is not None + and self.quant_method.moe_kernel.output_is_reduced() ) def _maybe_reduce_shared_expert_output( From d8cfa5ad5fe3fa8b4966ff5a1776df39893d9c93 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 03:55:47 +0000 Subject: [PATCH 061/191] fix merge Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/shared_experts.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 10c1e1cf5060..68f6e2a76bf9 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -155,22 +155,11 @@ def _run_in_aux_stream( return output - def _maybe_reduce_shared_output(self, output: torch.Tensor) -> torch.Tensor: - if ( - 
self._quant_method.moe_kernel is not None - and self._quant_method.moe_kernel.output_is_reduced() - and get_tensor_model_parallel_world_size() > 1 - ): - output = tensor_model_parallel_all_reduce(output) - return output - @property def output(self) -> torch.Tensor: assert self._output is not None output = self._output self._output = None - if output is not None: - output = self._maybe_reduce_shared_output(output) return output def apply( @@ -188,4 +177,4 @@ def apply( if order == SharedExpertsOrder.MULTI_STREAM_OVERLAPPED: self._output = self._run_in_aux_stream(shared_experts_input) else: - self._output = self._shared_experts(shared_experts_input) + self._output = self._layer(shared_experts_input) From 9b45bdf97ea239742745977aa367cfdc3e17a72d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 04:01:09 +0000 Subject: [PATCH 062/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 6b63f7a575e8..c4d5c6126d66 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -48,9 +48,6 @@ from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( SharedExperts, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) @@ -657,7 +654,6 @@ def _init_shared_experts(self): # -> SharedExperts | None: # called, i.e. by a MK or by the MoERunner. # Once the MK can be created upfront, we can just pass in the proper # flags derived from the quant_method's MK. 
- reduce_results=self.reduce_results, quant_method=self.quant_method, ) return From 149947d979c0e3c0bbdee7468fc49ace59de36ab Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 17 Mar 2026 20:52:54 +0000 Subject: [PATCH 063/191] update layer test for MoE refactoring Signed-off-by: Bill Nell --- .../modular_kernel_tools/parallel_utils.py | 32 +++-- tests/kernels/moe/utils.py | 121 +++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 7 +- 3 files changed, 104 insertions(+), 56 deletions(-) diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 3ff2ce3b3c01..10a226bcd977 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -11,7 +11,11 @@ from typing_extensions import ParamSpec from vllm.config import VllmConfig, set_current_vllm_config -from vllm.distributed import init_distributed_environment, initialize_model_parallel +from vllm.distributed import ( + cleanup_dist_env_and_memory, + init_distributed_environment, + initialize_model_parallel, +) from vllm.utils.network_utils import get_open_port ## Parallel Processes Utils @@ -36,10 +40,17 @@ def _set_vllm_config( temp_file = tempfile.mkstemp()[1] + # When DP is enabled, processes are organized as: + # rank = dp_rank * tp_pp_world_size + tp_pp_rank + tp_pp_world_size = vllm_config.parallel_config.world_size + vllm_config.parallel_config.data_parallel_rank = rank // tp_pp_world_size + tp_pp_rank = rank % tp_pp_world_size + vllm_config.parallel_config.rank = tp_pp_rank + with set_current_vllm_config(vllm_config): init_distributed_environment( - world_size=world_size, - rank=rank, + world_size=tp_pp_world_size, + rank=tp_pp_rank, distributed_init_method=f"file://{temp_file}", local_rank=local_rank, backend="nccl", @@ -59,11 +70,11 @@ def _worker_parallel_launch( world_local_size: int, node_rank: int, init_method: str, - worker: 
Callable[Concatenate[ProcessGroupInfo, VllmConfig | None, Any, P], None], + worker: Callable[..., None], vllm_config: VllmConfig | None, env_dict: dict | None, - *args: P.args, - **kwargs: P.kwargs, + worker_kwargs: dict[str, Any], + *args: Any, ) -> None: rank = node_rank * world_local_size + local_rank torch.accelerator.set_device_index(local_rank) @@ -98,14 +109,17 @@ def _worker_parallel_launch( vllm_config, cpu_group, *args, - **kwargs, + **worker_kwargs, ) except Exception as ex: print(ex) traceback.print_exc() raise finally: - torch.distributed.destroy_process_group() + if vllm_config is not None: + cleanup_dist_env_and_memory() + else: + torch.distributed.destroy_process_group() def parallel_launch_with_config( @@ -116,7 +130,6 @@ def parallel_launch_with_config( *args: P.args, **kwargs: P.kwargs, ) -> None: - assert not kwargs spawn( _worker_parallel_launch, args=( @@ -127,6 +140,7 @@ def parallel_launch_with_config( worker, vllm_config, env_dict, + kwargs, ) + args, nprocs=world_size, diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 2ef4424c2baa..8763ad683517 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -248,7 +248,7 @@ def make_quantized_test_activations( return a, a_q, a_scale -def moe_quantize_weights( +def moe_quantize_weights_2d( w: torch.Tensor, w_s: torch.Tensor | None, quant_dtype: torch.dtype | str | None, @@ -293,6 +293,40 @@ def moe_quantize_weights( return w, w_s, w_gs +def moe_quantize_weights( + w: torch.Tensor, + w_s: torch.Tensor | None, + quant_dtype: torch.dtype | str | None, + per_token_quant: bool, + block_shape: list[int] | None, +) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]: + assert w.dim() == 3 + e, rows, cols = w.shape + w_l = [None] * e + w_s_l = [None] * e + w_gs_l = [None] * e + for idx in range(e): + w_l[idx], w_s_l[idx], w_gs_l[idx] = moe_quantize_weights_2d( + w[idx], None, quant_dtype, per_token_quant, block_shape + ) + + w = torch.stack(w_l) + 
w_s = torch.stack(w_s_l) + w_gs = torch.stack(w_gs_l) if e > 0 and w_gs_l[0] is not None else None + + if w_s.ndim == 2: + assert w_s.shape[-1] == 1 + w_s = w_s.view(-1, 1, 1) + + if block_shape is not None: + block_n, block_k = block_shape + n_tiles = (rows + block_n - 1) // block_n + k_tiles = (cols + block_k - 1) // block_k + assert w_s.shape == (e, n_tiles, k_tiles) + + return w, w_s, w_gs + + def make_test_weight( e: int, rows: int, @@ -303,30 +337,11 @@ def make_test_weight( per_out_ch_quant: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]: w_16 = torch.randn((e, rows, cols), device="cuda", dtype=in_dtype) / 15 - w_gs = None if quant_dtype is not None: - w_l = [None] * e - w_s_l = [None] * e - w_gs_l = [None] * e - for idx in range(e): - w_l[idx], w_s_l[idx], w_gs_l[idx] = moe_quantize_weights( - w_16[idx], None, quant_dtype, per_out_ch_quant, block_shape - ) - - w = torch.stack(w_l) - w_s = torch.stack(w_s_l) - if e > 0 and w_gs_l[0] is not None: - w_gs = torch.stack(w_gs_l) - if w_s.ndim == 2: - assert w_s.shape[-1] == 1 - w_s = w_s.view(-1, 1, 1) - - if block_shape is not None: - block_n, block_k = block_shape - n_tiles = (rows + block_n - 1) // block_n - k_tiles = (cols + block_k - 1) // block_k - assert w_s.shape == (e, n_tiles, k_tiles) + w, w_s, w_gs = moe_quantize_weights( + w_16, None, quant_dtype, per_out_ch_quant, block_shape + ) else: w = w_16 w_s = None @@ -454,7 +469,6 @@ def fused_moe( ) -# CustomOp? 
class BaselineSiluAndMul(torch.nn.Module):
    """Reference SiLU-and-multiply activation built from plain torch ops.

    The last dimension of the input is split at its midpoint; the result is
    ``silu(first_part) * second_part``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        split_at = x.shape[-1] // 2
        gate_part = x[..., :split_at]
        up_part = x[..., split_at:]
        activated = torch.nn.functional.silu(gate_part)
        return activated * up_part
quant_config = None return RealMLP(K, N, w1, w2, "silu", quant_config, w1_s=w1_s, w2_s=w2_s) @@ -614,3 +626,22 @@ def modular_triton_fused_moe( TritonExperts(moe_config, quant_config), inplace=False, ) + + +def make_shared_experts( + N: int, + K: int, + in_dtype: torch.dtype = torch.bfloat16, + quant_dtype: torch.dtype | str | None = None, +) -> torch.nn.Module: + (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights( + 1, + N, + K, + in_dtype=in_dtype, + quant_dtype=quant_dtype, + ) + + return make_shared_experts_with_weights( + N, K, in_dtype, w1, w2, w1_s=w1_s, w2_s=w2_s, quant_dtype=quant_dtype + ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c4d5c6126d66..ad423468f0e5 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -358,6 +358,8 @@ def __init__( assert self.moe_parallel_config.is_sequence_parallel == is_sequence_parallel + logger.debug("FusedMoEParallelConfig = %s", str(self.moe_parallel_config)) + self.global_num_experts = num_experts + num_redundant_experts self.logical_num_experts = num_experts @@ -568,6 +570,8 @@ def __init__( self.quant_config = quant_config + logger.debug("FusedMoEConfig = %s", self.moe_config) + def _get_quant_method() -> FusedMoEMethodBase: """ Helper method to ensure self.quant_method is never None and @@ -1458,8 +1462,7 @@ def _maybe_make_contiguous( or name.startswith("_gate.") or name.startswith("_routed_input_transform.") or name.startswith("_routed_output_transform.") - ) - and name not in NON_EXPERT_WEIGHTS + ) and name not in NON_EXPERT_WEIGHTS ) return [ From ce8465c2256d11d4488ee0f6c8190ab828778db9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 18:03:35 +0000 Subject: [PATCH 064/191] Add test files Signed-off-by: Bill Nell --- tests/kernels/moe/conftest.py | 14 + tests/kernels/moe/test_moe_layer.py | 1615 +++++++++++++++++++++++++++ 2 files changed, 1629 insertions(+) create 
mode 100644 tests/kernels/moe/conftest.py create mode 100644 tests/kernels/moe/test_moe_layer.py diff --git a/tests/kernels/moe/conftest.py b/tests/kernels/moe/conftest.py new file mode 100644 index 000000000000..a217fe684eb9 --- /dev/null +++ b/tests/kernels/moe/conftest.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--subtests", action="store", type=str, default=None, help="subtest ids" + ) + + +@pytest.fixture +def subtests(request): + return request.config.getoption("--subtests") diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py new file mode 100644 index 000000000000..dfae667be45b --- /dev/null +++ b/tests/kernels/moe/test_moe_layer.py @@ -0,0 +1,1615 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for the MOE layer. + +Run `pytest tests/kernels/test_moe_layer.py`. 
+""" + +import functools +import os +import traceback +import types +from collections.abc import Callable +from dataclasses import astuple, dataclass, fields +from itertools import product +from typing import get_args + +import pytest +import torch + +from tests.kernels.moe.modular_kernel_tools.parallel_utils import ( + ProcessGroupInfo, + _set_vllm_config, + parallel_launch_with_config, +) +from tests.kernels.moe.utils import TestMLP, make_test_weights, moe_quantize_weights +from vllm.config import ( + CompilationConfig, + ParallelConfig, + VllmConfig, + set_current_vllm_config, +) +from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace +from vllm.forward_context import set_forward_context +from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE, fused_experts +from vllm.model_executor.layers.fused_moe.activation import MoEActivation +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.router.router_factory import ( + create_fused_moe_router, +) +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8Config, + ModelOptNvFp4Config, +) +from vllm.platforms import current_platform +from vllm.utils.flashinfer import ( + has_flashinfer_nvlink_one_sided, + has_flashinfer_nvlink_two_sided, +) +from vllm.utils.import_utils import has_deep_ep, has_mori, has_nixl_ep +from vllm.utils.math_utils import cdiv +from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed +from vllm.v1.worker.workspace import ( + init_workspace_manager, + is_workspace_manager_initialized, +) + +fp8_dtype = torch.float8_e4m3fn # current_platform.fp8_dtype + +SHAPE_COMBOS = [ + (1, 128, 256), + (32, 1024, 512), + (222, 2048, 2048), # should be big enough to exercise DP chunking +] + +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] + +# dp_size, tp_size, use_ep +# Note: 
DP+TP is not yet supported in the FusedMoE layer. +PARALLEL_COMBOS = [ + [1, 2, False], + [1, 4, False], + [2, 1, True], + [4, 1, True], +] + +# TODO: should this even be set manually? let oracles handle this +BACKENDS = ["allgather_reducescatter"] + +if has_mori(): + BACKENDS += ["mori"] + +if has_flashinfer_nvlink_two_sided(): + BACKENDS += ["flashinfer_nvlink_two_sided"] + +if has_flashinfer_nvlink_one_sided(): + BACKENDS += ["flashinfer_nvlink_one_sided"] + +if has_deep_ep(): + BACKENDS += ["deepep_low_latency", "deepep_high_throughput"] + +if has_nixl_ep(): + BACKENDS += ["nixl_ep"] + +QUANT_METHODS = [ + None, + "fp8", + "modelopt_fp8", + "modelopt_fp4", +] + +# Which quantization methods each backend supports. +# fmt: off +BACKEND_SUPPORTED_QUANTS: dict[str, set[str | None]] = { + "allgather_reducescatter": {None, "fp8", "modelopt_fp8", "modelopt_fp4"}, + "mori": {None, "fp8", "modelopt_fp8"}, + "flashinfer_nvlink_two_sided": {None, "modelopt_fp8", "modelopt_fp4"}, + "flashinfer_nvlink_one_sided": {None, "modelopt_fp8", "modelopt_fp4"}, + "deepep_low_latency": {None, "fp8", "modelopt_fp8", "modelopt_fp4"}, + "deepep_high_throughput": {None, "fp8", "modelopt_fp8", "modelopt_fp4"}, + "nixl_ep": {None, "fp8", "modelopt_fp8", "modelopt_fp4"}, # fp4? +} +# fmt: on + +# Which quantization methods support EPLB. +# ModelOptFp8MoEMethod inherits supports_eplb=False from FusedMoEMethodBase. +# TODO: double check modelopt fp8 +# modelopt_fp4 excluded: get_expert_weights() can't handle NvFP4 packed format. +EPLB_SUPPORTED_QUANTS: list[str | None] = [None, "fp8"] + +# Which backends support EPLB. +# deepep backends fail in get_expert_weights / rearrange_expert_weights_inplace. 
+# TODO(bnell): check this +EPLB_SUPPORTED_BACKENDS: list[str] = ["allgather_reducescatter"] + + +def maybe_roundup_layer_hidden_size( + hidden_size: int, + act_dtype: torch.dtype, + backend: str | None, +) -> int: + """ + Given layer hidden size and MoE configurations, round up hidden_size + if necessary. + + Args: + hidden_size: Layer hidden-size + act_dtype: Data type of the layer activations. + moe_parallel_config: Fused MoE parallelization strategy configuration. + + Return: + Rounded up hidden_size if rounding up is required based on the configs + and all2all backend. + Original hidden size otherwise. + """ + if backend == "deepep_high_throughput": + from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( + DeepEPHTPrepareAndFinalize, + ) + + hidden_size = DeepEPHTPrepareAndFinalize.maybe_roundup_layer_hidden_size( + hidden_size, act_dtype + ) + + if backend == "deepep_low_latency": + from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( + DeepEPLLPrepareAndFinalize, + ) + + hidden_size = DeepEPLLPrepareAndFinalize.maybe_roundup_layer_hidden_size( + hidden_size + ) + + return hidden_size + + +def rank_chunk(num: int, r: int, w: int) -> int: + rem = num % w + return (num // w) + (1 if r < rem else 0) + + +def chunk_by_rank( + t: torch.Tensor, + r: int, + w: int, + dim: int = 0, + device: torch.device | None = None, +) -> torch.Tensor: + chunk = cdiv(t.shape[dim], w) + t = t.narrow(dim, r * chunk, chunk) + if device is not None: + t = t.to(device) + return t + + +def maybe_chunk_by_rank( + t: torch.Tensor | None, + r: int, + w: int, + dim: int = 0, + device: torch.device | None = None, +) -> torch.Tensor | None: + if t is not None: + return chunk_by_rank(t, r, w, dim, device) + else: + return t + + +def tp_chunk_gate_up( + w: torch.Tensor, + tp_rank: int, + tp_size: int, + dim: int, + device: torch.device | int | None = None, +) -> torch.Tensor: + """TP-chunk a combined [gate; up] weight, splitting each half 
separately + so every rank gets a portion of both gate and up.""" + half = w.shape[dim] // 2 + gate = chunk_by_rank( + w.narrow(dim, 0, half), tp_rank, tp_size, dim=dim, device=device + ) + up = chunk_by_rank( + w.narrow(dim, half, half), tp_rank, tp_size, dim=dim, device=device + ) + return torch.cat([gate, up], dim=dim) + + +@dataclass +class MoETestConfig: + m: int + n: int + k: int + num_experts: int + top_k: int + in_dtype: torch.dtype + quantization: str | None + use_shared_experts: bool + use_gate: bool + use_routed_input_transform: bool + enable_eplb: bool = False + backend: str | None = None + ep_size: int = 1 + dp_size: int = 1 + tp_size: int = 1 + + # TODO: add more error messages + def id(self) -> str: + def proc(s: str) -> str: + return s.removeprefix("torch.") + + id_str = "-".join([proc(str(f)) for f in astuple(self)]) + return f"[{id_str}]" + + # TODO: add more error messages + @staticmethod + def from_id(id: str) -> "MoETestConfig": + id = id[1:-1] + str_values = id.split("-") + + def convert(v: str, ty): + if isinstance(ty, types.UnionType): + sub_ty = list(get_args(ty)) + assert len(sub_ty) == 2 and types.NoneType in sub_ty + sub_ty.remove(types.NoneType) + return sub_ty[0](v) if v != "None" else None + elif ty is torch.dtype: + ty_val = getattr(torch, v, None) + assert isinstance(ty_val, torch.dtype) + return ty_val + elif ty is bool: + return v == "True" + else: + return ty(v) + + values = tuple( + [convert(v, f.type) for v, f in zip(str_values, fields(MoETestConfig))] + ) + return MoETestConfig(*values) + + +def generate_valid_test_configs( + backend: str, + ep_size: int, + dp_size: int, + tp_size: int, + verbosity: int = 0, +) -> list[MoETestConfig]: + configs: list[MoETestConfig] = [] + + for ( + shape, + num_experts, + top_k, + quantization, + use_shared_experts, + use_gate, + use_routed_input_transform, + enable_eplb, + ) in product( + SHAPE_COMBOS, + NUM_EXPERTS, + TOP_KS, + QUANT_METHODS, + [False, True], # shared + [False, True], # gate 
+ [False, True], # routed input exform + [False, True], # eplb + ): + config = MoETestConfig( + shape[0], # m + shape[1], # n + shape[2], # k + num_experts, + top_k, + torch.bfloat16, + quantization, + use_shared_experts, + use_gate, + use_routed_input_transform, + enable_eplb, + backend, + ep_size, + dp_size, + tp_size, + ) + + valid, reason = is_valid_config(config) + if valid: + configs.append(config) + elif verbosity > 1: + print(f"Skipping invalid config {config} - {reason}") + + return configs + + +# TODO: break this up into sections +def is_valid_config(config: MoETestConfig) -> tuple[bool, str | None]: + # routed_input_transform only makes sense with shared_experts (latent MoE) + # TODO: not sure this is true + if config.use_routed_input_transform and not config.use_shared_experts: + return False, "routed_input_transform requires shared_experts" + + # TODO: disable for now + if config.use_routed_input_transform and config.enable_eplb: + return False, "routed_input_transform not supported with EPLB." + + # TODO: disable for now + if config.use_routed_input_transform and config.use_gate: + return ( + False, + "routed_input_transform not supported with gate because of " + "padding problems", + ) + + # TODO: disable for now + if config.use_routed_input_transform and config.backend in [ + "deepep_low_latency", + "deepep_high_throughput", + ]: + return ( + False, + "routed_input_transform not supported with DeepEP backends because " + "of padding problems", + ) + + # routed_input_transform + quantization + high hidden dimensions + # TODO: Disable >= 2048 w/fp8 + deepep LL for now due to insane errors. 
+ if ( + (config.use_routed_input_transform or config.backend == "deepep_low_latency") + and config.quantization is not None + and config.k >= 2048 + ): + return ( + False, + "routed_input_transform + quantization + higher hidden dimensions " + "leads to large differences.", + ) + + # gate requires shared_experts (use_overlapped mode) + # TODO: also not sure this is true + if config.use_gate and not config.use_shared_experts: + return False, "gate requires shared_experts (use_overlapped mode)" + + # Skip modelopt_fp4 if not on B100+ (compute capability 10.0+) + if ( + config.quantization == "modelopt_fp4" + and not current_platform.has_device_capability(100) + ): + return False, "modelopt_fp4 not supported on H100+ GPUs" + + # Skip flashinfer_nvlink if not on B100+ (compute capability 10.0+) + if ( + config.backend is not None + and config.backend.startswith("flashinfer_nvlink") + and not current_platform.has_device_capability(100) + ): + return False, "flashinfer_nvlink not supported on H100+ GPUs" + + # Backend-specific checks + if config.backend is not None: + supported_quants = BACKEND_SUPPORTED_QUANTS.get(config.backend) + if supported_quants is not None and config.quantization not in supported_quants: + return ( + False, + f"{config.backend} does not support quantization={config.quantization}", + ) + + if config.backend == "deepep_low_latency": + from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 + DeepEPLLPrepareAndFinalize, + ) + + if config.k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES: + return ( + False, + f"Skipping unsupported K {config.k} in {config.backend} w/o EP.", + ) + + if config.enable_eplb and config.quantization not in EPLB_SUPPORTED_QUANTS: + return False, f"EPLB not supported with {config.quantization} quantization." + + if config.enable_eplb and config.backend not in EPLB_SUPPORTED_BACKENDS: + return False, f"EPLB not supported with {config.backend}." 
+ + if ( + config.backend is not None + and config.backend.startswith("flashinfer_nvlink") + and config.ep_size > 1 + ): + return False, "flashinfer_nvlink EP not yet supported." + + if config.enable_eplb and config.num_experts % config.dp_size != 0: + return False, "EPLB requires num_experts divisible by ep_size" + + if config.enable_eplb and config.ep_size == 1: + return False, "EPLB only works with EP+DP" + + return True, None + + +def chunk_scales_by_rank( + t: torch.Tensor | None, + r: int, + w: int, + device: torch.device | None = None, +) -> torch.Tensor | None: + if t is not None and t.numel() > 1: + # Calculate start index by summing chunk sizes for all previous ranks + # start = sum(rank_chunk(t.shape[0], i, w) for i in range(r)) + # chunk = rank_chunk(t.shape[0], r, w) + # t = t[start:(start + chunk)] + chunk = rank_chunk(t.shape[0], r, w) + t = t[(r * chunk) : max(t.shape[0], (r + 1) * chunk)] + + if t is not None and device is not None: + t = t.to(device) + + return t + + +def chunk_scales( + t: torch.Tensor | None, + start: int, + end: int, + device: torch.device | None = None, +) -> torch.Tensor | None: + if t is not None and t.numel() > 1: + t = t[start:end] + + if t is not None and device is not None: + t = t.to(device) + + return t + + +@dataclass +class QuantizedWeights: + w13_weight: torch.Tensor + w2_weight: torch.Tensor + w13_weight_scale: torch.Tensor | None = None + w2_weight_scale: torch.Tensor | None = None + w13_weight_scale_2: torch.Tensor | None = None + w2_weight_scale_2: torch.Tensor | None = None + w13_input_scale: torch.Tensor | None = None + w2_input_scale: torch.Tensor | None = None + + +def _quantize_fp8_halves( + w1: torch.Tensor, + w2: torch.Tensor, +) -> QuantizedWeights: + """Quantize w13 gate/up halves separately to FP8, producing per-shard scales.""" + half = w1.shape[1] // 2 + w1q_a, w1s_a, _ = moe_quantize_weights( + w1[:, :half, :], None, fp8_dtype, False, None + ) + w1q_b, w1s_b, _ = moe_quantize_weights( + w1[:, half:, 
:], None, fp8_dtype, False, None + ) + assert w1s_a is not None and w1s_b is not None + + w2q, w2s, _ = moe_quantize_weights(w2, None, fp8_dtype, False, None) + assert w2s is not None + + return QuantizedWeights( + w13_weight=torch.cat([w1q_a, w1q_b], dim=1), + w2_weight=w2q, + # Each w1s_x is (E, 1, 1) -> reshape to (E, 1), cat to (E, 2) + w13_weight_scale=torch.cat([w1s_a.view(-1, 1), w1s_b.view(-1, 1)], dim=1), + # w2s is (E, 1, 1) -> reshape to (E,) + w2_weight_scale=w2s.view(-1), + ) + + +def quantization_to_quant_dtype( + quantization: str | None, +) -> torch.dtype | str | None: + if quantization is None: + return None + elif quantization in ["fp8", "modelopt_fp8"]: + return fp8_dtype + elif quantization in ["modelopt_fp4"]: + return "nvfp4" + else: + raise NotImplementedError(f"Unsupported quantization: {quantization}") + + +def make_quant_config( + quantization: str | None, + w1: torch.Tensor, + w2: torch.Tensor, + num_experts: int, +) -> tuple[QuantizationConfig | None, QuantizedWeights]: + from vllm.model_executor.layers.quantization.fp8 import Fp8Config + + if quantization is None: + return None, QuantizedWeights(w13_weight=w1, w2_weight=w2) + + if quantization == "fp8": + return Fp8Config(True), _quantize_fp8_halves(w1, w2) + + if quantization == "modelopt_fp8": + qw = _quantize_fp8_halves(w1, w2) + # why? + qw.w13_input_scale = torch.ones( + num_experts, dtype=torch.float32, device=w1.device + ) + # why? + qw.w2_input_scale = torch.ones( + num_experts, dtype=torch.float32, device=w2.device + ) + quant_config = ModelOptFp8Config( + quant_method="FP8", + is_checkpoint_fp8_serialized=True, + kv_cache_quant_method=None, + exclude_modules=[], + ) + return quant_config, qw + + if quantization == "modelopt_fp4": + # Quantize full w13 at once so both gate/up halves share the same + # global scale per expert. process_weights_after_loading uses + # w13_weight_scale_2[:, 0] for the entire tensor, so the two shard + # scales must match. 
+ w1q, w1s, w1gs = moe_quantize_weights(w1, None, "nvfp4", False, None) + assert w1s is not None and w1gs is not None + + w2q, w2s, w2gs = moe_quantize_weights(w2, None, "nvfp4", False, None) + assert w2s is not None and w2gs is not None + + qw = QuantizedWeights( + w13_weight=w1q, + w2_weight=w2q, + w13_weight_scale=w1s, + w2_weight_scale=w2s, + # weight_scale_2 = 1/w_gs: the kernel computes + # g_alphas = a_scale * w_scale_2, and correct dequant needs 1/w_gs. + # Expand per-expert scalar to (E, 2) for the two shards. + w13_weight_scale_2=(1.0 / w1gs).unsqueeze(1).expand(-1, 2).contiguous(), + w2_weight_scale_2=1.0 / w2gs, + w13_input_scale=torch.ones( + (num_experts, 2), dtype=torch.float32, device=w1.device + ), + w2_input_scale=torch.ones( + num_experts, dtype=torch.float32, device=w2.device + ), + ) + quant_config = ModelOptNvFp4Config( + is_checkpoint_nvfp4_serialized=True, + kv_cache_quant_algo=None, + exclude_modules=[], + ) + return quant_config, qw + + raise NotImplementedError(f"Unsupported quantization: {quantization}") + + +@dataclass +class SharedExpertsConfig: + w1: torch.Tensor + w2: torch.Tensor + w1_s: torch.Tensor | None = None + w2_s: torch.Tensor | None = None + quant_dtype: torch.dtype | str | None = None + + +@dataclass +class MoETestData: + """Container for MOE test data and transforms.""" + + w1: torch.Tensor + w2: torch.Tensor + hidden_states: torch.Tensor + router_logits: torch.Tensor + shared_experts_config: SharedExpertsConfig | None + gate: torch.nn.Module | None + routed_input_transform: torch.nn.Module | None + routed_output_transform: torch.nn.Module | None + routed_expert_hidden_size: int + + +class SimpleGate(torch.nn.Module): + """Simple gate module for testing: computes router logits from hidden states.""" + + def __init__( + self, + hidden_size: int, + num_experts: int, + dtype: torch.dtype, + device: str = "cuda", + ): + super().__init__() + self.weight = torch.nn.Parameter( + torch.randn(num_experts, hidden_size, 
class SimpleRoutedInputTransform(torch.nn.Module):
    """Bias-free linear projection used as a stand-in routed input transform
    (e.g., latent projection) in tests.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        dtype: torch.dtype,
        device: str = "cuda",
    ):
        super().__init__()
        # Weight is (out_features, in_features), the layout F.linear expects;
        # the draw is scaled down by 10.
        initial = torch.randn(
            (out_features, in_features), dtype=dtype, device=device
        )
        self.weight = torch.nn.Parameter(initial / 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        projected = torch.nn.functional.linear(x, self.weight)
        return projected
def setup_moe_test_data(
    m: int,
    k: int,
    n: int,
    num_experts: int,
    in_dtype: torch.dtype,
    use_shared_experts: bool,
    use_gate: bool,
    use_routed_input_transform: bool,
    backend: str | None,
    device: str = "cuda",
) -> MoETestData:
    """Build the weights, inputs and optional modules for one MoE test case.

    The tensors are drawn in a fixed order (expert weights, shared-expert
    weights, input transform, gate, output transform, hidden states, router
    logits), so reordering the statements changes the values produced for a
    given seed.

    Args:
        m: Number of tokens
        k: Hidden size
        n: Intermediate size
        num_experts: Number of experts
        in_dtype: Data type for tensors
        use_shared_experts: Whether to create shared experts config
        use_gate: Whether to create gate module
        use_routed_input_transform: Whether to create routed input/output transforms
        backend: All2all backend name. Currently unused: its only references
            are in the commented-out hidden-size round-up below.
        device: Device for the tensors and modules created directly in this
            function ("cuda" or "cpu").
            NOTE(review): the routed expert weights come from
            make_test_weights, which is not passed `device` - confirm where
            it allocates.

    Returns:
        MoETestData containing all test data and transforms
    """
    # For latent MoE: latent_size = k // 2
    latent_size = k // 2

    # k = maybe_roundup_layer_hidden_size(k, in_dtype, backend)
    # latent_size = maybe_roundup_layer_hidden_size(latent_size, in_dtype, backend)

    # Determine dimensions for routed experts (may be transformed)
    # For latent MoE, routed experts operate entirely in latent space
    # (k//2). The routed_output_transform then projects back to k before
    # adding with shared experts.
    # w1: (E, 2*N, latent_size) - input latent_size
    # w2: (E, latent_size, N) - output latent_size (fused_experts returns
    # same shape as input)
    routed_expert_hidden_size = latent_size if use_routed_input_transform else k

    # Create expert weights (only the first element of each returned tuple
    # is kept; no quant_dtype is requested here).
    (w1, _, _, _), (w2, _, _, _) = make_test_weights(
        num_experts,
        n,
        routed_expert_hidden_size,  # Both w1 input and w2 output use latent_size
        in_dtype=in_dtype,
    )

    # Create shared experts config if needed.
    # Shared experts always use the full hidden size k: w1 is (k, 2n) and
    # w2 is (n, k), regardless of the routed input transform.
    if use_shared_experts:
        shared_experts_config = SharedExpertsConfig(
            w1=torch.randn((k, n * 2), device=device, dtype=in_dtype) / 15,
            w2=torch.randn((n, k), device=device, dtype=in_dtype) / 15,
        )
    else:
        shared_experts_config = None

    # Create routed input transform if needed (projects k -> latent_size)
    routed_input_transform = (
        SimpleRoutedInputTransform(k, latent_size, in_dtype, device=device)
        if use_routed_input_transform
        else None
    )

    # Create gate if needed
    # Note: gate is called AFTER routed_input_transform, so it should expect
    # the transformed dimension (latent_size) when routed_input_transform is used
    gate_input_dim = latent_size if use_routed_input_transform else k
    gate = (
        SimpleGate(gate_input_dim, num_experts, in_dtype, device=device)
        if use_gate
        else None
    )

    # Create routed output transform if needed (projects latent space back to original)
    # The same module class is reused with the dimensions swapped.
    routed_output_transform = (
        SimpleRoutedInputTransform(latent_size, k, in_dtype, device=device)
        if use_routed_input_transform
        else None
    )

    # Create test inputs: hidden states are scaled down by 10, router logits
    # are left unscaled.
    hidden_states = torch.randn((m, k), device=device, dtype=in_dtype) / 10
    router_logits = torch.randn((m, num_experts), device=device, dtype=in_dtype)

    return MoETestData(
        w1=w1,
        w2=w2,
        hidden_states=hidden_states,
        router_logits=router_logits,
        shared_experts_config=shared_experts_config,
        gate=gate,
        routed_input_transform=routed_input_transform,
        routed_output_transform=routed_output_transform,
        routed_expert_hidden_size=routed_expert_hidden_size,
    )
+ topk_group=topk_group, + quant_config=quant_config, + tp_size=tp_size, + ep_size=ep_size, + dp_size=dp_size, + pcp_size=pcp_size, + prefix="from_forward_context", + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + apply_router_weight_on_input=apply_router_weight_on_input, + activation=activation, + enable_eplb=enable_eplb, + num_redundant_experts=num_redundant_experts, + has_bias=has_bias, + **kwargs, + ) + + for name, value in [ + ("w13_weight", qw.w13_weight), + ("w2_weight", qw.w2_weight), + ("w13_weight_scale", qw.w13_weight_scale), + ("w2_weight_scale", qw.w2_weight_scale), + ("w13_weight_scale_2", qw.w13_weight_scale_2), + ("w2_weight_scale_2", qw.w2_weight_scale_2), + ("w13_input_scale", qw.w13_input_scale), + ("w2_input_scale", qw.w2_input_scale), + ]: + if value is not None: + layer.register_parameter( + name, torch.nn.Parameter(value, requires_grad=False) + ) + + layer.quant_method.process_weights_after_loading(layer) + + # Temporary hack until #36286 or #36732 lands + if quantization is None: + layer.maybe_init_modular_kernel() + + return layer + + +def make_fake_moe_layer( + w1: torch.Tensor, + w2: torch.Tensor, + top_k: int, + global_num_experts: int, + in_dtype: torch.dtype, + quant_dtype: torch.dtype | None, + renormalize: bool = False, + shared_experts_config: SharedExpertsConfig | None = None, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + indices_type: torch.dtype | None = None, + expert_map: torch.Tensor | None = None, + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + 
logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + gate: torch.nn.Module | None = None, + routed_input_transform: torch.nn.Module | None = None, + routed_output_transform: torch.nn.Module | None = None, + use_ep: bool = False, + tp_size: int = 1, + dp_size: int = 1, + ep_size: int = 1, +) -> Callable: + activation = MoEActivation.from_str(activation) + + router = create_fused_moe_router( + top_k=top_k, + global_num_experts=global_num_experts, + # eplb_state=None, # TODO + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + num_fused_shared_experts=0, # TODO + enable_eplb=enable_eplb, + # TODO(bnell): once we can construct the MK at init time, we + # can make this a value. + indices_type_getter=lambda: indices_type, + ) + + if quant_dtype is not None: + w1, w1_s, _ = moe_quantize_weights(w1, None, quant_dtype, False, None) + w2, w2_s, _ = moe_quantize_weights(w2, None, quant_dtype, False, None) + else: + w1_s = None + w2_s = None + + shared_experts = create_shared_experts_from_config( + shared_experts_config, in_dtype, 1, 0, "cuda" + ) + + quant_config = FusedMoEQuantConfig.make( + quant_dtype, + w1_scale=w1_s, + w2_scale=w2_s, + ) + + def _moe( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor: + # Save original hidden_states for shared experts (before transform) + original_hidden_states = hidden_states + + # Apply routed input transform if provided + if routed_input_transform is not None: + hidden_states = routed_input_transform(hidden_states) + + # If gate provided, compute router_logits from hidden_states + # Note: gate operates on transformed hidden_states (after + # routed_input_transform) + if gate is not None: + 
router_logits, _ = gate(hidden_states) + + topk_weights, topk_ids = router.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + ) + + # Shared experts use original (untransformed) hidden_states + if shared_experts is not None: + shared_output = shared_experts(original_hidden_states) + else: + shared_output = None + + # Routed experts use transformed hidden_states + output = fused_experts( + hidden_states=hidden_states, + w1=w1, + w2=w2, + quant_config=quant_config, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) + + # Apply routed output transform if provided + # (e.g., latent space -> original space) + if routed_output_transform is not None: + output = routed_output_transform(output) + + if shared_experts is not None: + assert shared_output is not None + output += shared_output + + # Apply TP/DP reduction if not already reduced + # if (tp_size > 1 or dp_size > 1): + # output = tensor_model_parallel_all_reduce(output) + + return output + + return _moe + + +def _test_body_regular( + moe_fn: Callable, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + vllm_config: VllmConfig, + num_tokens: int, + num_tokens_across_dp: torch.Tensor, + **kwargs, +) -> tuple[torch.Tensor, torch.Tensor]: + """Regular MoE test body: compare layer output to baseline.""" + baseline_output = kwargs["baseline_output"] + + with set_forward_context( + None, + vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + ): + output = moe_fn(hidden_states, router_logits) + + return baseline_output, output + + +def _test_body_eplb( + moe_fn: Callable, + moe_layer: FusedMoE, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + vllm_config: VllmConfig, + num_tokens: int, + num_tokens_across_dp: torch.Tensor, + cpu_group, + in_dtype: torch.dtype, + 
quantization: str | None, + use_ep: bool, + tp_size: int, + ep_size: int, + dp_size: int, + w1: torch.Tensor, + w2: torch.Tensor, + num_experts: int, + k: int, + n: int, + top_k: int, + shared_experts, + gate: torch.nn.Module | None, + routed_input_transform: torch.nn.Module | None, + routed_output_transform: torch.nn.Module | None, + **kwargs, +) -> tuple[torch.Tensor, torch.Tensor]: + device = torch.accelerator.current_accelerator() + + """EPLB test body: compare output before and after expert weight rearrangement.""" + # Get "before" output with original weight arrangement + with set_forward_context( + None, + vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + ): + output_before = moe_fn(hidden_states, router_logits) + + # Create a fresh FusedMoE layer with enable_eplb=True + # Delete the original layer's registration so the constructor can + # re-use the same "from_forward_context" prefix + cc = vllm_config.compilation_config + del cc.static_forward_context["from_forward_context"] + cc.static_all_moe_layers.remove("from_forward_context") + + # Determine hidden size for MoE layer + # When using routed_input_transform, experts operate in latent space + hidden_size_for_layer = k // 2 if routed_input_transform is not None else k + + moe_layer = make_fused_moe_layer( + quantization=quantization, + use_ep=use_ep, + hidden_size=hidden_size_for_layer, + intermediate_size=n, + in_dtype=in_dtype, + tp_size=tp_size, + ep_size=ep_size, + dp_size=dp_size, + w1=w1, + w2=w2, + top_k=top_k, + global_num_experts=num_experts, + shared_experts=shared_experts, + enable_eplb=True, + gate=gate, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + ) + + # Necessary? 
+ if moe_layer._expert_map is not None: + moe_layer._expert_map = moe_layer._expert_map.to(device) + + # All ranks must generate the same permutation + initial_indices = torch.arange(num_experts, dtype=torch.long) + shuffled_indices = initial_indices[torch.randperm(num_experts)] + + # Rearrange expert weights across EP ranks + expert_weights = [list(moe_layer.get_expert_weights())] + rearrange_expert_weights_inplace( + old_global_expert_indices=initial_indices.unsqueeze(0), + new_global_expert_indices=shuffled_indices.unsqueeze(0), + expert_weights=expert_weights, + ep_group=cpu_group, + ) + + # Build logical_to_physical_map from shuffled_indices + # shuffled_indices[physical] = logical, we need the inverse + logical_to_physical = torch.empty(num_experts, dtype=torch.int32, device=device) + logical_to_physical[shuffled_indices.to(device)] = torch.arange( + num_experts, dtype=torch.int32, device=device + ) + + moe_layer.set_eplb_state( + moe_layer_idx=0, + expert_load_view=torch.zeros( + (1, num_experts), + dtype=torch.int32, + device=device, + ), + logical_to_physical_map=logical_to_physical.reshape(num_experts, 1).unsqueeze( + 0 + ), + logical_replica_count=torch.ones( + (1, num_experts), + dtype=torch.int32, + device=device, + ), + ) + + # Get "after" output with rearranged weights and EPLB routing + with set_forward_context( + None, + vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + ): + output_after = moe_layer(hidden_states, router_logits) + + return output_before, output_after + + +# TODO: make this take a MoETestConfig +def _run_one_config( + vllm_config: VllmConfig, + ep_size: int, + dp_size: int, + tp_size: int, + dp_rank: int, + tp_rank: int, + m: int, + n: int, + k: int, + num_experts: int, + top_k: int, + quantization: str | None, + backend: str | None, + test_body_fn: Callable, + use_shared_experts: bool, + use_gate: bool, + use_routed_input_transform: bool, + **kwargs, +) -> None: + set_random_seed(7) + + """Generic 
test loop that sets up environment and delegates to test_body_fn. + + This function is called directly by test_moe_layer and test_moe_layer_eplb + via parallel_launch_with_config, passing either _test_body_regular or + _test_body_eplb as the test_body_fn parameter. + """ + world_size = tp_size * dp_size + use_ep = ep_size > 1 + + assert vllm_config.parallel_config.enable_expert_parallel == use_ep + + in_dtype = torch.bfloat16 + device = torch.accelerator.current_accelerator() + + if not is_workspace_manager_initialized(): + init_workspace_manager(device) + + # Create test data and transforms + test_data = setup_moe_test_data( + m=m, + k=k, + n=n, + num_experts=num_experts, + in_dtype=in_dtype, + use_shared_experts=use_shared_experts, + use_gate=use_gate, + use_routed_input_transform=use_routed_input_transform, + backend=backend, + device=device, + ) + + # Extract data from test_data + hidden_states = test_data.hidden_states + router_logits = test_data.router_logits + w1 = test_data.w1 + w2 = test_data.w2 + shared_experts_config = test_data.shared_experts_config + gate = test_data.gate + routed_input_transform = test_data.routed_input_transform + routed_output_transform = test_data.routed_output_transform + + baseline_layer = make_fake_moe_layer( + w1=w1, + w2=w2, + top_k=top_k, + global_num_experts=num_experts, + in_dtype=in_dtype, + quant_dtype=None, # quantization_to_quant_dtype(quantization), + renormalize=False, + shared_experts_config=shared_experts_config, + gate=gate, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + use_ep=use_ep, + tp_size=tp_size, + ep_size=ep_size, + dp_size=dp_size, + ) + + baseline_output = baseline_layer(hidden_states, router_logits) + + del baseline_layer + torch.accelerator.empty_cache() + + with set_current_vllm_config(vllm_config): + # Chunk weights for EP/TP (after baseline is created) + if ep_size > 1: + w1 = chunk_by_rank(w1, dp_rank, dp_size, dim=0, device=device) + w2 = 
chunk_by_rank(w2, dp_rank, dp_size, dim=0, device=device) + + if tp_size > 1: + w1 = tp_chunk_gate_up(w1, tp_rank, tp_size, dim=1, device=device) + w2 = chunk_by_rank(w2, tp_rank, tp_size, dim=2, device=device) + + # Setup shared experts if needed + shared_experts = create_shared_experts_from_config( + shared_experts_config, in_dtype, tp_size, tp_rank, device + ) + + # Determine hidden size for MoE layer + # When using routed_input_transform, experts operate in latent space + hidden_size_for_layer = k // 2 if routed_input_transform is not None else k + + # Create initial MoE layer + moe_layer = make_fused_moe_layer( + quantization=quantization, + use_ep=use_ep, + hidden_size=hidden_size_for_layer, + intermediate_size=n, + in_dtype=in_dtype, + tp_size=tp_size, + ep_size=ep_size, + dp_size=dp_size, + w1=w1, + w2=w2, + top_k=top_k, + global_num_experts=num_experts, + shared_experts=shared_experts, + gate=gate, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + ) + + # Necessary? 
+ if moe_layer._expert_map is not None: + moe_layer._expert_map = moe_layer._expert_map.to(device) + + num_tokens = m + num_tokens_across_dp = torch.tensor( + [num_tokens] * world_size, + device=device, + dtype=torch.int, + ) + + # Call the test body function with all necessary context + expected, actual = test_body_fn( + moe_fn=moe_layer, + moe_layer=moe_layer, + hidden_states=hidden_states, + router_logits=router_logits, + vllm_config=vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + in_dtype=in_dtype, + quantization=quantization, + use_ep=use_ep, + tp_size=tp_size, + ep_size=ep_size, + dp_size=dp_size, + w1=w1, + w2=w2, + num_experts=num_experts, + k=k, + n=n, + m=m, + top_k=top_k, + shared_experts=shared_experts, + gate=gate, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + baseline_output=baseline_output, + **kwargs, + ) + + # Common tolerance logic + # TODO: consider associating tolerances with quant methods. + if quantization is None: + if k >= 2048: + atol, rtol = 6.5e-2, 6.5e-2 + else: + atol, rtol = 3.5e-2, 3.5e-2 + elif quantization in ("fp8", "modelopt_fp8"): + atol, rtol = 6e-2, 6e-2 + elif quantization == "modelopt_fp4": + atol = rtol = 1e-1 + k * 5e-4 + else: + atol, rtol = 6e-2, 6e-2 + + torch.accelerator.synchronize() # TODO: Is this needed? 
+ torch.testing.assert_close(expected, actual, atol=atol, rtol=rtol) + + +# Test for non-parallel cases (world_size == 1) - backend doesn't matter +@pytest.mark.parametrize("m, n, k", SHAPE_COMBOS) +@pytest.mark.parametrize("num_experts", NUM_EXPERTS) +@pytest.mark.parametrize("top_k", TOP_KS) +@pytest.mark.parametrize("quantization", QUANT_METHODS) +@pytest.mark.parametrize("use_shared_experts", [False, True]) +@pytest.mark.parametrize("use_gate", [False, True]) +@pytest.mark.parametrize("use_routed_input_transform", [False, True]) +def test_moe_layer_no_parallel( + m: int, + n: int, + k: int, + num_experts: int, + top_k: int, + quantization: str | None, + use_shared_experts: bool, + use_gate: bool, + use_routed_input_transform: bool, + monkeypatch, +): + """Test MoE layer without parallelism (dp_size=1, tp_size=1, use_ep=False).""" + + if os.environ.get("VLLM_LOGGING_LEVEL") is None: + monkeypatch.setenv("VLLM_LOGGING_LEVEL", "ERROR") + + test_config = MoETestConfig( + m, + n, + k, + num_experts, + top_k, + torch.bfloat16, + quantization, + use_shared_experts, + use_gate, + use_routed_input_transform, + ) + + valid, reason = is_valid_config(test_config) + if not valid: + pytest.skip(reason) + + set_random_seed(7) + + parallel_config = ParallelConfig() + compilation_config = CompilationConfig() + compilation_config.pass_config.fuse_allreduce_rms = False + + vllm_config = VllmConfig( + parallel_config=parallel_config, compilation_config=compilation_config + ) + + # Initialize distributed environment for single GPU + _set_vllm_config(vllm_config, 1, rank=0, local_rank=0) + + _run_one_config( + vllm_config, + test_config.ep_size, + test_config.dp_size, + test_config.tp_size, + 0, + 0, + test_config.m, + test_config.n, + test_config.k, + test_config.num_experts, + test_config.top_k, + test_config.quantization, + test_config.backend, + _test_body_regular, + use_shared_experts=test_config.use_shared_experts, + use_gate=test_config.use_gate, + 
use_routed_input_transform=test_config.use_routed_input_transform, + ) + + +def _test_body_config(test_config: MoETestConfig, cpu_group, **kwargs): + if not test_config.enable_eplb: + return _test_body_regular(**kwargs) + else: + return _test_body_eplb(**kwargs, cpu_group=cpu_group) + + +def _parallel_worker( + pgi: ProcessGroupInfo, + vllm_config: VllmConfig, + cpu_group, + test_configs: list[MoETestConfig], + verbosity: int, + **kwargs, +) -> None: + set_random_seed(7) + + total = 0 + passed = 0 + failed = 0 + fail_ids = [] + + dp_rank = vllm_config.parallel_config.data_parallel_rank + + for test_config in test_configs: + cc = vllm_config.compilation_config + if "from_forward_context" in cc.static_forward_context: + del cc.static_forward_context["from_forward_context"] + cc.static_all_moe_layers.remove("from_forward_context") + + tp_rank = pgi.rank % test_config.tp_size + + if verbosity > 0: + print(f"subtest: {test_config.id()}", end="") + + try: + _run_one_config( + vllm_config, + test_config.ep_size, + test_config.dp_size, + test_config.tp_size, + dp_rank, + tp_rank, + test_config.m, + test_config.n, + test_config.k, + test_config.num_experts, + test_config.top_k, + test_config.quantization, + test_config.backend, + functools.partial( + _test_body_config, test_config=test_config, cpu_group=cpu_group + ), + use_shared_experts=test_config.use_shared_experts, + use_gate=test_config.use_gate, + use_routed_input_transform=test_config.use_routed_input_transform, + ) + if verbosity > 0: + print(" PASSED") + else: + print(".", end="") + passed = passed + 1 + except Exception as ex: + fail_ids.append(test_config.id()) + failed = failed + 1 + if verbosity > 0: + traceback.print_exc() + print(f"\n{str(ex)}\nFAILED") + else: + print("F", end="") + finally: + total = total + 1 + + skipped = total - (passed + failed) + + fails = f"{failed} failed" if failed > 0 else "" + sep = ", " if fails != "" else "" + skips = f"{sep}{skipped} skipped" if skipped > 0 else "" + sep = ", 
" if skips != "" or fails != "" else "" + passes = f"{sep}{passed} passed" if passed > 0 else "" + + report = ( + f"============= {fails}{skips}{passes} of {total} total tests =============" + ) + + sep = "\n" if verbosity == 0 else "" + print(f"{sep}{report}") + + if failed > 0: + fail_ids_str = "\n".join(fail_ids) + raise RuntimeError( + f"\n============= Failed subtests =============\n{fail_ids_str}\n{report}" + ) + + +# TODO: add cudagraphs/torch.compile tests +@pytest.mark.parametrize("dp_size, tp_size, use_ep", PARALLEL_COMBOS) +@pytest.mark.parametrize("backend", BACKENDS) +def test_moe_layer( + dp_size: int, + tp_size: int, + use_ep: bool, + backend: str, + monkeypatch, + pytestconfig, + subtests, +): + """Test MoE layer with parallelism (multi-GPU or TP/EP enabled). + + For non-parallel cases (world_size == 1), use test_moe_layer_no_parallel instead. + """ + num_gpus = cuda_device_count_stateless() + world_size = tp_size * dp_size + ep_size = 1 if not use_ep else world_size # or dp_size? 
+ assert world_size > 1 + + # Check if enough GPUs available + if world_size is not None and num_gpus is not None and world_size > num_gpus: + pytest.skip(f"Not enough GPUs got {num_gpus}, expected {world_size}.") + + verbosity = pytestconfig.getoption("verbose") + + test_env = dict() + test_env["VLLM_MOE_DP_CHUNK_SIZE"] = "128" + monkeypatch.setenv("VLLM_MOE_DP_CHUNK_SIZE", "128") + if os.environ.get("VLLM_LOGGING_LEVEL") is None: + monkeypatch.setenv("VLLM_LOGGING_LEVEL", "ERROR") + + # TODO + # VLLM_FLASHINFER_MOE_BACKEND=latency + # VLLM_USE_FLASHINFER_MOE_FP16=1 + # VLLM_USE_FLASHINFER_MOE_FP8 + # VLLM_USE_FLASHINFER_MOE_FP4 + # VLLM_USE_FLASHINFER_MOE_INT4 + + parallel_config = ParallelConfig( + pipeline_parallel_size=1, + data_parallel_size=dp_size, + tensor_parallel_size=tp_size, + enable_expert_parallel=use_ep, + all2all_backend=backend, + ) + + compilation_config = CompilationConfig() + # compilation_config.mode = CompilationMode.NONE # for now + compilation_config.pass_config.fuse_allreduce_rms = False # for now + + vllm_config = VllmConfig( + parallel_config=parallel_config, compilation_config=compilation_config + ) + + test_configs = generate_valid_test_configs( + backend, ep_size, dp_size, tp_size, verbosity + ) + + if subtests is not None: + new_test_configs = [] + for subtest in subtests.split(","): + sub_test_config = MoETestConfig.from_id(subtest) + if sub_test_config in test_configs: + new_test_configs.append(sub_test_config) + else: + pytest.skip( + "subtest config does not match any valid test configuration" + ) + test_configs = new_test_configs + + try: + parallel_launch_with_config( + world_size, + _parallel_worker, + vllm_config, + test_env, + test_configs, + verbosity, + ) + finally: + torch.accelerator.synchronize() # TODO: Is this needed? 
+ torch.accelerator.empty_cache() From d81320301ea157b0ea9cce706b0320cf786d0de1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 18:31:16 +0000 Subject: [PATCH 065/191] fix Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index dfae667be45b..e5212759a947 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1602,14 +1602,17 @@ def test_moe_layer( test_configs = new_test_configs try: - parallel_launch_with_config( - world_size, - _parallel_worker, - vllm_config, - test_env, - test_configs, - verbosity, - ) + if len(test_configs) > 0: + parallel_launch_with_config( + world_size, + _parallel_worker, + vllm_config, + test_env, + test_configs, + verbosity, + ) + else: + pytest.skip("No valid test configs for current parallel config.") finally: torch.accelerator.synchronize() # TODO: Is this needed? 
torch.accelerator.empty_cache() From 2d4eac0873391fe85cbbcc134558e8b76c9c398d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 19:33:38 +0000 Subject: [PATCH 066/191] remove cruft Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index e5212759a947..4dd391e86cdd 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1257,9 +1257,6 @@ def _run_one_config( baseline_output = baseline_layer(hidden_states, router_logits) - del baseline_layer - torch.accelerator.empty_cache() - with set_current_vllm_config(vllm_config): # Chunk weights for EP/TP (after baseline is created) if ep_size > 1: From e1b52ed583d4c96e384d2b95b47957b45a184fc8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 18 Mar 2026 23:48:48 +0000 Subject: [PATCH 067/191] add better skip msg Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index 4dd391e86cdd..ea02c9a3edb1 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1594,7 +1594,8 @@ def test_moe_layer( new_test_configs.append(sub_test_config) else: pytest.skip( - "subtest config does not match any valid test configuration" + f"subtest config {subtest} does not match any valid test " + "configuration" ) test_configs = new_test_configs From 8856700c7549410b17a36d6d195f881defa48b2e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 04:00:30 +0000 Subject: [PATCH 068/191] fix merge Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index ad423468f0e5..9bcfac5f0e79 
100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1462,7 +1462,8 @@ def _maybe_make_contiguous( or name.startswith("_gate.") or name.startswith("_routed_input_transform.") or name.startswith("_routed_output_transform.") - ) and name not in NON_EXPERT_WEIGHTS + ) + and name not in NON_EXPERT_WEIGHTS ) return [ From 73cddf97c82d915488aa3ccf4c2bf13fb66a121e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 04:12:38 +0000 Subject: [PATCH 069/191] fix test imports Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index ea02c9a3edb1..e55bbe640af3 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -145,7 +145,7 @@ def maybe_roundup_layer_hidden_size( Original hidden size otherwise. """ if backend == "deepep_high_throughput": - from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import ( DeepEPHTPrepareAndFinalize, ) @@ -154,7 +154,7 @@ def maybe_roundup_layer_hidden_size( ) if backend == "deepep_low_latency": - from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import ( DeepEPLLPrepareAndFinalize, ) @@ -397,7 +397,7 @@ def is_valid_config(config: MoETestConfig) -> tuple[bool, str | None]: ) if config.backend == "deepep_low_latency": - from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 + from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import ( # noqa: E501 DeepEPLLPrepareAndFinalize, ) From dff6766cba8b6386ebd275709300e2353741768e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 19 Mar 2026 22:12:49 +0000 Subject: [PATCH 070/191] part 1 
Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 9 ++------- .../layers/fused_moe/runner/shared_experts.py | 15 ++------------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9bcfac5f0e79..f576f91eefd9 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -665,12 +665,7 @@ def _init_shared_experts(self): # -> SharedExperts | None: self.shared_experts = SharedExperts( self._shared_experts, moe_config=self.moe_config, - # Note: For now we must pass quant_method along to SharedExperts so it - # can property determine where the shared experts are supposed to be - # called, i.e. by a MK or by the MoERunner. - # Once the MK can be created upfront, we can just pass in the proper - # flags derived from the quant_method's MK. - quant_method=self.quant_method, + mk_owns_shared_expert=self.quant_method.mk_owns_shared_expert, ) def _init_runner(self) -> MoERunner: @@ -683,11 +678,11 @@ def _init_runner(self) -> MoERunner: moe_config=self.moe_config, router=self.router, routed_input_transform=self._routed_input_transform, + routed_output_transform=self._routed_output_transform, gate=self._gate, shared_experts=self.shared_experts, quant_method=self.quant_method, enable_dbo=self.vllm_config.parallel_config.enable_dbo, - routed_output_transform=self._routed_output_transform, apply_scale_to_output=self._apply_scale_to_output, routed_scaling_factor=self.routed_scaling_factor, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 68f6e2a76bf9..b318e9eb3938 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -9,9 +9,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, ) -from 
vllm.model_executor.layers.quantization.base_config import ( - QuantizeMethodBase, -) from vllm.platforms import current_platform from vllm.utils.torch_utils import ( aux_stream, @@ -45,20 +42,12 @@ def __init__( self, layer: torch.nn.Module, moe_config: FusedMoEConfig, - quant_method: QuantizeMethodBase, + mk_owns_shared_expert: bool, ): - from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( - FusedMoEMethodBase, - ) - - # quant_method must be a FusedMoEMethodBase but we can't use the type - # due to circular imports. - assert isinstance(quant_method, FusedMoEMethodBase) - self._output: torch.Tensor | None = None self._layer = layer self._moe_config = moe_config - self._quant_method = quant_method + self._mk_owns_shared_expert = mk_owns_shared_expert self._use_dp_chunking = moe_config.moe_parallel_config.use_dp_chunking # Allow disabling of the separate shared experts stream for From 9dc517644dc485faaa5e212233c264706a242bd6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 19 Mar 2026 22:58:52 +0000 Subject: [PATCH 071/191] _get_quant_method Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f576f91eefd9..5260c97cd609 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -572,22 +572,11 @@ def __init__( logger.debug("FusedMoEConfig = %s", self.moe_config) - def _get_quant_method() -> FusedMoEMethodBase: - """ - Helper method to ensure self.quant_method is never None and - of the proper type. 
- """ - quant_method = None - if self.quant_config is not None: - quant_method = self.quant_config.get_quant_method(self, prefix) - if quant_method is None: - quant_method = UnquantizedFusedMoEMethod(self.moe_config) - assert isinstance(quant_method, FusedMoEMethodBase) - return quant_method - - # Note: get_quant_method will look at the layer's local_num_experts - # for heuristic purposes, so it must be initialized first. - self.quant_method: FusedMoEMethodBase = _get_quant_method() + self.quant_method = self._get_quant_method( + prefix, + quant_config, + self.moe_config, + ) if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): raise NotImplementedError( @@ -638,7 +627,25 @@ def _get_quant_method() -> FusedMoEMethodBase: self.shared_experts: SharedExperts | None = None self.runner = self._init_runner() - def _init_shared_experts(self): # -> SharedExperts | None: + def _get_quant_method( + self, + prefix: str, + quant_config: QuantizationConfig | None, + moe_config: FusedMoEConfig, + ) -> FusedMoEMethodBase: + """ + Helper method to ensure self.quant_method is never None and + of the proper type. 
+ """ + quant_method = None + if quant_config is not None: + quant_method = quant_config.get_quant_method(self, prefix) + if quant_method is None: + quant_method = UnquantizedFusedMoEMethod(moe_config) + assert isinstance(quant_method, FusedMoEMethodBase) + return quant_method + + def _init_shared_experts(self): if self._shared_experts is None: return From 77f473be1215f97c1dbeadf7d4546dd88961727c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 23 Mar 2026 23:02:12 +0000 Subject: [PATCH 072/191] wip Signed-off-by: Bill Nell --- .../distributed/elastic_ep/elastic_execute.py | 5 +- vllm/model_executor/layers/fused_moe/layer.py | 165 ++++++++---------- .../fused_moe/runner/default_moe_runner.py | 5 +- .../layers/fused_moe/runner/moe_runner.py | 22 +++ .../fused_moe/runner/moe_runner_base.py | 30 +++- .../fused_moe/runner/moe_runner_factory.py | 5 +- .../model_executor/layers/quantization/fp8.py | 5 + vllm/model_executor/models/deepseek_v2.py | 2 +- 8 files changed, 132 insertions(+), 107 deletions(-) diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py index 8b05c58eaec5..e911ad66764f 100644 --- a/vllm/distributed/elastic_ep/elastic_execute.py +++ b/vllm/distributed/elastic_ep/elastic_execute.py @@ -405,10 +405,11 @@ def switch_and_prepare(self) -> None: ) # Force re-creation of the modular kernel (and all2all manager) # for the new EP size by resetting quant_method to base + # TODO(bnell): this is a hack and will not work for MKs created in + # the new style. 
for module in moe_modules: if hasattr(module.quant_method, "old_quant_method"): - module.quant_method = module.quant_method.old_quant_method - module.runner = module._init_runner() + module._replace_quant_method(module.quant_method.old_quant_method) prepare_communication_buffer_for_model(self.worker.model_runner.model) if ( self.worker.vllm_config.compilation_config.mode diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5260c97cd609..dcc13f76031c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -24,7 +24,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, - FusedMoEQuantConfig, RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( @@ -318,8 +317,6 @@ def __init__( ): super().__init__() - self._routed_input_transform = routed_input_transform - self._routed_output_transform = routed_output_transform self._apply_scale_to_output = apply_scale_to_output if params_dtype is None: @@ -487,6 +484,8 @@ def __init__( self.apply_router_weight_on_input = apply_router_weight_on_input self.activation = MoEActivation.from_str(activation) + self.runner: MoERunner + # TODO(bnell): we should not have to create a router if the kernel is # monolithic. self.router = create_fused_moe_router( @@ -507,7 +506,7 @@ def __init__( enable_eplb=enable_eplb, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. 
- indices_type_getter=lambda: self.quant_method.topk_indices_dtype, + indices_type_getter=lambda: self.runner.quant_method.topk_indices_dtype, zero_expert_type=zero_expert_type, num_logical_experts=self.logical_num_experts, ) @@ -572,7 +571,7 @@ def __init__( logger.debug("FusedMoEConfig = %s", self.moe_config) - self.quant_method = self._get_quant_method( + quant_method = self._get_quant_method( prefix, quant_config, self.moe_config, @@ -583,7 +582,7 @@ def __init__( "is_act_and_mul=False is supported only for CUDA and ROCm for now" ) - if self.enable_eplb and not self.quant_method.supports_eplb: + if self.enable_eplb and not quant_method.supports_eplb: # TODO: Add support for additional quantization methods. # The implementation for other quantization methods does not # contain essential differences, but the current quant API @@ -592,7 +591,7 @@ def __init__( # If you plan to add support for more quantization methods, # please refer to the implementation in `Fp8MoEMethod`. raise NotImplementedError( - f"EPLB is not supported {self.quant_method.__class__.__name__}." + f"EPLB is not supported {quant_method.__class__.__name__}." ) moe_quant_params = { @@ -605,27 +604,22 @@ def __init__( "global_num_experts": self.global_num_experts, } # need full intermediate size pre-sharding for WNA16 act order - if self.quant_method.__class__.__name__ in ( + if quant_method.__class__.__name__ in ( "GPTQMarlinMoEMethod", "CompressedTensorsWNA16MarlinMoEMethod", "CompressedTensorsWNA16MoEMethod", ): moe_quant_params["intermediate_size_full"] = intermediate_size - self.quant_method.create_weights(layer=self, **moe_quant_params) - - # TODO(bnell): Why is this needed? Can probably be removed. - self.base_quant_method = self.quant_method + quant_method.create_weights(layer=self, **moe_quant_params) - # Note: for now, the layer must keep _gate and _shared_experts. 
- # This is because a number of locations swap out the quant_method - # which requires re-initializing the SharedExperts and DefaultMoERunner. - # Once we've figured out alternatives to swapping out the quant_method, - # we can move ownership of _gate and _shared_experts into the runner. - self._gate = gate - self._shared_experts = shared_experts - self.shared_experts: SharedExperts | None = None - self.runner = self._init_runner() + self.runner = self._init_runner( + quant_method=quant_method, + gate=gate, + shared_experts=shared_experts, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + ) def _get_quant_method( self, @@ -634,7 +628,7 @@ def _get_quant_method( moe_config: FusedMoEConfig, ) -> FusedMoEMethodBase: """ - Helper method to ensure self.quant_method is never None and + Helper method to ensure quant_method is never None and of the proper type. """ quant_method = None @@ -645,37 +639,14 @@ def _get_quant_method( assert isinstance(quant_method, FusedMoEMethodBase) return quant_method - def _init_shared_experts(self): - if self._shared_experts is None: - return - - # Note: If the SharedExperts already exist, we reinitialize - # them in place. This is because the MK might be holding a - # reference to the same SharedExperts object. If we create a - # new instance, the MK will still be holding onto the old one, - # including the old quant_method. This is a workaround for - # UnquantizedFusedMoEMethod's handling of MK initialization - # which should be fixed by #36732. - if self.shared_experts is not None: - self.shared_experts.__init__( # type: ignore - self._shared_experts, - moe_config=self.moe_config, - # Note: For now we must pass quant_method along to SharedExperts so it - # can property determine where the shared experts are supposed to be - # called, i.e. by a MK or by the MoERunner. - # Once the MK can be created upfront, we can just pass in the proper - # flags derived from the quant_method's MK. 
- quant_method=self.quant_method, - ) - return - - self.shared_experts = SharedExperts( - self._shared_experts, - moe_config=self.moe_config, - mk_owns_shared_expert=self.quant_method.mk_owns_shared_expert, - ) - - def _init_runner(self) -> MoERunner: + def _init_runner( + self, + quant_method: FusedMoEMethodBase, + gate: torch.nn.Module | None, + shared_experts: torch.nn.Module | None, + routed_input_transform: torch.nn.Module | None = None, + routed_output_transform: torch.nn.Module | None = None, + ) -> MoERunner: # Storing the runner in the FusedMoE is an intermediate state, eventually # the runner will own the FusedMoE layer and provide the execution interface # for MoE ops. @@ -684,41 +655,54 @@ def _init_runner(self) -> MoERunner: layer=self, moe_config=self.moe_config, router=self.router, - routed_input_transform=self._routed_input_transform, - routed_output_transform=self._routed_output_transform, - gate=self._gate, - shared_experts=self.shared_experts, - quant_method=self.quant_method, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + gate=gate, + shared_experts=shared_experts, + quant_method=quant_method, enable_dbo=self.vllm_config.parallel_config.enable_dbo, apply_scale_to_output=self._apply_scale_to_output, routed_scaling_factor=self.routed_scaling_factor, ) # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py + # and vllm/distributed/elastic_ep/elastic_execute.py # can safely swap out the quant_method. We should figure out a less # intrusive way to do this. def _replace_quant_method(self, mk: FusedMoEMethodBase): - self.quant_method = mk - # We need to force reconstruction of runner because we're swapping out - # the quant_method with a FusedMoEModularMethod. This logic can go - # away once the FusedMoEModularMethod is eliminated. 
- self.runner = self._init_runner() + self.runner._replace_quant_method(mk) # Note: maybe_init_modular_kernel should only be called by # prepare_communication_buffer_for_model. # This is called after all weight loading and post-processing, so it # should be safe to swap out the quant_method. def maybe_init_modular_kernel(self) -> None: + print( + f"GOT HERE {self.runner.quant_method} " + "{self.runner.quant_method.moe_kernel} " + "{self.runner.quant_method.supports_internal_mk} " + "{self.runner.quant_method.is_monolithic}" + ) + # NOTE(rob): WIP refactor. For quant methods that own the MK # we create the MK during process_weights_after_loading. - if self.quant_method.supports_internal_mk or self.quant_method.is_monolithic: + if ( + self.runner.quant_method.supports_internal_mk + or self.runner.quant_method.is_monolithic + ): return None self.ensure_moe_quant_config_init() # routing_tables only needed for round-robin expert placement with # DeepEP all2all backend. routing_tables = self._maybe_init_expert_routing_tables() - prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize( + + if isinstance(self.runner.quant_method, FusedMoEModularMethod): + base_quant_method = self.runner.quant_method.old_quant_method + else: + base_quant_method = self.runner.quant_method + + prepare_finalize = base_quant_method.maybe_make_prepare_finalize( routing_tables=routing_tables ) if prepare_finalize is not None: @@ -728,7 +712,7 @@ def maybe_init_modular_kernel(self) -> None: self._replace_quant_method( FusedMoEModularMethod.make( self, - self.base_quant_method, + base_quant_method, prepare_finalize, self.shared_experts, inplace=not self.moe_config.disable_inplace, @@ -765,7 +749,15 @@ def use_ep(self): @property def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass - return self.runner.is_internal_router() + return self.runner.is_internal_router + + @property + def is_monolithic(self) -> bool: + return 
self.runner.quant_method.is_monolithic + + @property + def shared_experts(self) -> SharedExperts | None: + return self.runner.shared_experts def _maybe_init_expert_routing_tables( self, @@ -1102,12 +1094,12 @@ def weight_loader( param.data[:, :dim1, :dim2].copy_(loaded_weight) return True if return_success else None - quant_method_name = self.quant_method.__class__.__name__ + quant_method_name = self.runner.quant_method.__class__.__name__ global_expert_id = expert_id expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id) use_global_sf = ( - getattr(self.quant_method, "use_global_sf", False) + getattr(self.runner.quant_method, "use_global_sf", False) and "input_scale" in weight_name ) @@ -1122,7 +1114,7 @@ def weight_loader( is_transposed = getattr(param, "is_transposed", False) # compressed-tensors checkpoints with packed weights are stored flipped - # TODO (mgoin): check self.quant_method.quant_config.quant_format + # TODO (mgoin): check self.runner.quant_method.quant_config.quant_format # against known CompressionFormat enum values that have this quality if quant_method_name in ( "CompressedTensorsWNA16MarlinMoEMethod", @@ -1229,7 +1221,7 @@ def weight_loader( if "ModelOpt" in quant_method_name: # Determine per-tensor weight scale patterns based on variant # Use the dedicated method instead of brittle string matching - uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() + uses_weight_scale_2 = self.runner.quant_method.uses_weight_scale_2_pattern() quant_method = getattr(param, "quant_method", None) # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) @@ -1459,26 +1451,13 @@ def _maybe_make_contiguous( assert all( weight.is_contiguous() for name, weight in weights - if not ( - name.startswith("_shared_experts.") - or name.startswith("_gate.") - or name.startswith("_routed_input_transform.") - or name.startswith("_routed_output_transform.") - ) - and name not in NON_EXPERT_WEIGHTS + if name not in 
NON_EXPERT_WEIGHTS ) return [ weight.view(self.local_num_experts, -1) for name, weight in weights - if name not in NON_EXPERT_WEIGHTS - and weight.shape != torch.Size([]) - and not name.startswith("_shared_experts.") - # exclude parameters from non-expert submodules, - # e.g. gate/shared/transforms. - and not name.startswith("_gate.") - and not name.startswith("_routed_input_transform.") - and not name.startswith("_routed_output_transform.") + if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) ] def set_eplb_state( @@ -1499,17 +1478,17 @@ def set_eplb_state( self.eplb_state.logical_replica_count = logical_replica_count[moe_layer_idx] def ensure_moe_quant_config_init(self): - if self.quant_method.moe_quant_config is None: + if self.runner.quant_method.moe_quant_config is None: # Note: the moe_quant_config can't be constructed until after # weight loading post processing. - self.quant_method.moe_quant_config = ( - self.quant_method.get_fused_moe_quant_config(self) + self.runner.quant_method.moe_quant_config = ( + self.runner.quant_method.get_fused_moe_quant_config(self) ) - @property - def moe_quant_config(self) -> FusedMoEQuantConfig | None: - self.ensure_moe_quant_config_init() - return self.quant_method.moe_quant_config + # @property + # def moe_quant_config(self) -> FusedMoEQuantConfig | None: + # self.ensure_moe_quant_config_init() + # return self.runner.quant_method.moe_quant_config def forward_native( self, diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index b6441ae38e5a..805eeb266068 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -18,9 +18,6 @@ FusedMoERouter, ) from vllm.model_executor.layers.fused_moe.runner.moe_runner_base import MoERunnerBase -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, 
-) logger = init_logger(__name__) @@ -59,7 +56,7 @@ def __init__( router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, - shared_experts: SharedExperts | None, + shared_experts: torch.nn.Module | None, quant_method: FusedMoEMethodBase, enable_dbo: bool, routed_output_transform: torch.nn.Module | None = None, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 856ae969214d..b7bd4520b3e2 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -4,6 +4,13 @@ import torch +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, +) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) + class MoERunner(ABC): """ @@ -22,6 +29,21 @@ def forward( ) -> torch.Tensor: raise NotImplementedError + @property + @abstractmethod + def quant_method(self) -> FusedMoEMethodBase: + raise NotImplementedError + + @property + @abstractmethod + def shared_experts(self) -> SharedExperts | None: + raise NotImplementedError + + @abstractmethod + def _replace_quant_method(self, quant_method: FusedMoEMethodBase): + raise NotImplementedError + + @property @abstractmethod def is_internal_router(self) -> bool: raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index ba7ba5806615..970f306801b2 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -191,7 +191,7 @@ def __init__( router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, - shared_experts: SharedExperts | None, + shared_experts: torch.nn.Module | None, quant_method: FusedMoEMethodBase, enable_dbo: 
bool, routed_output_transform: torch.nn.Module | None = None, @@ -203,8 +203,12 @@ def __init__( self.router = router self.routed_input_transform = routed_input_transform self.gate = gate - self.shared_experts = shared_experts - self.quant_method = quant_method + self._shared_experts = SharedExperts( + shared_experts, + moe_config=moe_config, + mk_owns_shared_expert=quant_method.mk_owns_shared_expert, # ? + ) + self._quant_method = quant_method self.enable_dbo = enable_dbo self.enable_eplb = moe_config.moe_parallel_config.enable_eplb self.routed_output_transform = routed_output_transform @@ -218,6 +222,26 @@ def __init__( self._forward_entry = self._select_forward(layer) + @property + def is_internal_router(self) -> bool: + return self.gate is not None + + @property + def quant_method(self) -> FusedMoEMethodBase: + return self._quant_method + + @property + def shared_experts(self) -> SharedExperts | None: + return self._shared_experts + + # TODO(bnell): Temporary hack. Get rid of this. + def _replace_quant_method(self, quant_method: FusedMoEMethodBase): + self._quant_method = quant_method + if self._shared_experts is not None: + self._shared_experts._mk_owns_shared_expert = ( + quant_method.mk_owns_shared_expert + ) + def _select_forward(self, layer: torch.nn.Module) -> Callable: if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py index c9a7efc431c3..4813929aeaa5 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py @@ -19,9 +19,6 @@ DefaultMoERunner, ) from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) def create_moe_runner( @@ -30,7 
+27,7 @@ def create_moe_runner( router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, - shared_experts: SharedExperts | None, + shared_experts: torch.nn.Module | None, quant_method: FusedMoEMethodBase, enable_dbo: bool, routed_output_transform: torch.nn.Module | None = None, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e07989734c47..198f58ad3ea4 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -832,7 +832,9 @@ def _setup_kernel( replace_parameter(layer, f"w2_{self.weight_scale_name}", w2_scale) self.moe_quant_config = self.get_fused_moe_quant_config(layer) + print("XXXXXXXXXXXXXXXXXXXXXXXXXX") if self.moe_quant_config: + print("YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY") assert self.experts_cls is not None self.moe_kernel = make_fp8_moe_kernel( moe_quant_config=self.moe_quant_config, @@ -844,9 +846,12 @@ def _setup_kernel( ) def process_weights_after_loading(self, layer: Module) -> None: + print("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") if getattr(layer, "_already_called_process_weights_after_loading", False): return + print("BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB") + # Allow for accessing weights and scales in standard way. w13 = layer.w13_weight w2 = layer.w2_weight diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c0d5849a2921..b50236c6b461 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -338,7 +338,7 @@ def __init__( # NOTE(dbari): Use BF16 if routing is not Deepseek, e.g. 
Mistral Large 3 self.gate.set_out_dtype( torch.float32 - if self.experts.quant_method.is_monolithic + if self.experts.is_monolithic and self.experts.routing_method_type == RoutingMethodType.DeepSeekV3 else torch.bfloat16 ) From 46101e7013f1ad1ff27a9fe93a61ec946a9fd11f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 23 Mar 2026 23:18:59 +0000 Subject: [PATCH 073/191] hack fix for weight loading Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 12 +++++------- vllm/model_executor/layers/quantization/fp8.py | 5 ----- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index dcc13f76031c..fdd0836be4a0 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -576,6 +576,9 @@ def __init__( quant_config, self.moe_config, ) + self.quant_method = ( + quant_method # only for weight loading. how to get around this? + ) if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): raise NotImplementedError( @@ -677,13 +680,6 @@ def _replace_quant_method(self, mk: FusedMoEMethodBase): # This is called after all weight loading and post-processing, so it # should be safe to swap out the quant_method. def maybe_init_modular_kernel(self) -> None: - print( - f"GOT HERE {self.runner.quant_method} " - "{self.runner.quant_method.moe_kernel} " - "{self.runner.quant_method.supports_internal_mk} " - "{self.runner.quant_method.is_monolithic}" - ) - # NOTE(rob): WIP refactor. For quant methods that own the MK # we create the MK during process_weights_after_loading. 
if ( @@ -1436,6 +1432,8 @@ def _maybe_make_contiguous( ) weights = list(self.named_parameters()) + # This doesn't work + # weights = weights + list(self.runner.quant_method.named_parameters()) weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] # `w13_input_scale` and `w2_input_scale` are global per-tensor diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 198f58ad3ea4..e07989734c47 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -832,9 +832,7 @@ def _setup_kernel( replace_parameter(layer, f"w2_{self.weight_scale_name}", w2_scale) self.moe_quant_config = self.get_fused_moe_quant_config(layer) - print("XXXXXXXXXXXXXXXXXXXXXXXXXX") if self.moe_quant_config: - print("YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY") assert self.experts_cls is not None self.moe_kernel = make_fp8_moe_kernel( moe_quant_config=self.moe_quant_config, @@ -846,12 +844,9 @@ def _setup_kernel( ) def process_weights_after_loading(self, layer: Module) -> None: - print("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") if getattr(layer, "_already_called_process_weights_after_loading", False): return - print("BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB") - # Allow for accessing weights and scales in standard way. 
w13 = layer.w13_weight w2 = layer.w2_weight From 6b9a89aff4067eb0a8fec9525fd525c260b325a9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Mar 2026 02:37:44 +0000 Subject: [PATCH 074/191] make ctors more independent Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 99 ++++++++----------- .../fused_moe/runner/moe_runner_base.py | 4 +- 2 files changed, 43 insertions(+), 60 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fdd0836be4a0..d6a7ac462e78 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -317,14 +317,11 @@ def __init__( ): super().__init__() - self._apply_scale_to_output = apply_scale_to_output - if params_dtype is None: params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype vllm_config = get_current_vllm_config() - self.vllm_config = vllm_config # FIXME (varun): We should have a better way of inferring the activation # datatype. This works for now as the tensor datatype entering the MoE @@ -481,14 +478,15 @@ def __init__( self.e_score_correction_bias = e_score_correction_bias # TODO(bnell): end attributes + # Store in runner? self.apply_router_weight_on_input = apply_router_weight_on_input self.activation = MoEActivation.from_str(activation) - self.runner: MoERunner + self._runner: MoERunner # TODO(bnell): we should not have to create a router if the kernel is # monolithic. - self.router = create_fused_moe_router( + router = create_fused_moe_router( top_k=top_k, global_num_experts=self.global_num_experts, eplb_state=self.eplb_state, @@ -506,11 +504,11 @@ def __init__( enable_eplb=enable_eplb, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. 
- indices_type_getter=lambda: self.runner.quant_method.topk_indices_dtype, + indices_type_getter=lambda: self._runner.quant_method.topk_indices_dtype, zero_expert_type=zero_expert_type, num_logical_experts=self.logical_num_experts, ) - self.routing_method_type: RoutingMethodType = self.router.routing_method_type + self.routing_method_type: RoutingMethodType = router.routing_method_type # When using zero experts, slice e_score_correction_bias to cover # only real experts, for compatibility with monolithic kernels that @@ -524,8 +522,8 @@ def __init__( # This way moe_config is created with the correct hidden_size from the start. unpadded_hidden_size = hidden_size self.model_type = ( - self.vllm_config.model_config.hf_config.model_type - if self.vllm_config.model_config is not None + vllm_config.model_config.hf_config.model_type + if vllm_config.model_config is not None else None ) hidden_size = maybe_roundup_hidden_size( @@ -616,12 +614,21 @@ def __init__( quant_method.create_weights(layer=self, **moe_quant_params) - self.runner = self._init_runner( - quant_method=quant_method, - gate=gate, - shared_experts=shared_experts, + # Storing the runner in the FusedMoE is an intermediate state, eventually + # the runner will own the FusedMoE layer and provide the execution interface + # for MoE ops. 
+ self._runner = create_moe_runner( + layer=self, + moe_config=self.moe_config, + router=router, routed_input_transform=routed_input_transform, routed_output_transform=routed_output_transform, + gate=gate, + shared_experts=shared_experts, + quant_method=quant_method, + enable_dbo=vllm_config.parallel_config.enable_dbo, + apply_scale_to_output=apply_scale_to_output, + routed_scaling_factor=routed_scaling_factor, ) def _get_quant_method( @@ -642,38 +649,12 @@ def _get_quant_method( assert isinstance(quant_method, FusedMoEMethodBase) return quant_method - def _init_runner( - self, - quant_method: FusedMoEMethodBase, - gate: torch.nn.Module | None, - shared_experts: torch.nn.Module | None, - routed_input_transform: torch.nn.Module | None = None, - routed_output_transform: torch.nn.Module | None = None, - ) -> MoERunner: - # Storing the runner in the FusedMoE is an intermediate state, eventually - # the runner will own the FusedMoE layer and provide the execution interface - # for MoE ops. - self._init_shared_experts() - return create_moe_runner( - layer=self, - moe_config=self.moe_config, - router=self.router, - routed_input_transform=routed_input_transform, - routed_output_transform=routed_output_transform, - gate=gate, - shared_experts=shared_experts, - quant_method=quant_method, - enable_dbo=self.vllm_config.parallel_config.enable_dbo, - apply_scale_to_output=self._apply_scale_to_output, - routed_scaling_factor=self.routed_scaling_factor, - ) - # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py # and vllm/distributed/elastic_ep/elastic_execute.py # can safely swap out the quant_method. We should figure out a less # intrusive way to do this. def _replace_quant_method(self, mk: FusedMoEMethodBase): - self.runner._replace_quant_method(mk) + self._runner._replace_quant_method(mk) # Note: maybe_init_modular_kernel should only be called by # prepare_communication_buffer_for_model. 
@@ -683,8 +664,8 @@ def maybe_init_modular_kernel(self) -> None: # NOTE(rob): WIP refactor. For quant methods that own the MK # we create the MK during process_weights_after_loading. if ( - self.runner.quant_method.supports_internal_mk - or self.runner.quant_method.is_monolithic + self._runner.quant_method.supports_internal_mk + or self._runner.quant_method.is_monolithic ): return None @@ -693,10 +674,10 @@ def maybe_init_modular_kernel(self) -> None: # DeepEP all2all backend. routing_tables = self._maybe_init_expert_routing_tables() - if isinstance(self.runner.quant_method, FusedMoEModularMethod): - base_quant_method = self.runner.quant_method.old_quant_method + if isinstance(self._runner.quant_method, FusedMoEModularMethod): + base_quant_method = self._runner.quant_method.old_quant_method else: - base_quant_method = self.runner.quant_method + base_quant_method = self._runner.quant_method prepare_finalize = base_quant_method.maybe_make_prepare_finalize( routing_tables=routing_tables @@ -745,15 +726,15 @@ def use_ep(self): @property def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass - return self.runner.is_internal_router + return self._runner.is_internal_router @property def is_monolithic(self) -> bool: - return self.runner.quant_method.is_monolithic + return self._runner.quant_method.is_monolithic @property def shared_experts(self) -> SharedExperts | None: - return self.runner.shared_experts + return self._runner.shared_experts def _maybe_init_expert_routing_tables( self, @@ -1090,12 +1071,12 @@ def weight_loader( param.data[:, :dim1, :dim2].copy_(loaded_weight) return True if return_success else None - quant_method_name = self.runner.quant_method.__class__.__name__ + quant_method_name = self._runner.quant_method.__class__.__name__ global_expert_id = expert_id expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id) use_global_sf = ( - getattr(self.runner.quant_method, "use_global_sf", False) 
+ getattr(self._runner.quant_method, "use_global_sf", False) and "input_scale" in weight_name ) @@ -1110,7 +1091,7 @@ def weight_loader( is_transposed = getattr(param, "is_transposed", False) # compressed-tensors checkpoints with packed weights are stored flipped - # TODO (mgoin): check self.runner.quant_method.quant_config.quant_format + # TODO (mgoin): check self._runner.quant_method.quant_config.quant_format # against known CompressionFormat enum values that have this quality if quant_method_name in ( "CompressedTensorsWNA16MarlinMoEMethod", @@ -1217,7 +1198,9 @@ def weight_loader( if "ModelOpt" in quant_method_name: # Determine per-tensor weight scale patterns based on variant # Use the dedicated method instead of brittle string matching - uses_weight_scale_2 = self.runner.quant_method.uses_weight_scale_2_pattern() + uses_weight_scale_2 = ( + self._runner.quant_method.uses_weight_scale_2_pattern() + ) quant_method = getattr(param, "quant_method", None) # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) @@ -1433,7 +1416,7 @@ def _maybe_make_contiguous( weights = list(self.named_parameters()) # This doesn't work - # weights = weights + list(self.runner.quant_method.named_parameters()) + # weights = weights + list(self._runner.quant_method.named_parameters()) weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] # `w13_input_scale` and `w2_input_scale` are global per-tensor @@ -1476,24 +1459,24 @@ def set_eplb_state( self.eplb_state.logical_replica_count = logical_replica_count[moe_layer_idx] def ensure_moe_quant_config_init(self): - if self.runner.quant_method.moe_quant_config is None: + if self._runner.quant_method.moe_quant_config is None: # Note: the moe_quant_config can't be constructed until after # weight loading post processing. 
- self.runner.quant_method.moe_quant_config = ( - self.runner.quant_method.get_fused_moe_quant_config(self) + self._runner.quant_method.moe_quant_config = ( + self._runner.quant_method.get_fused_moe_quant_config(self) ) # @property # def moe_quant_config(self) -> FusedMoEQuantConfig | None: # self.ensure_moe_quant_config_init() - # return self.runner.quant_method.moe_quant_config + # return self._runner.quant_method.moe_quant_config def forward_native( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: - return self.runner.forward( + return self._runner.forward( hidden_states, router_logits, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 970f306801b2..47a19d9d8ec2 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -87,7 +87,7 @@ def _moe_forward( layer_name: _layer_name_type, ) -> torch.Tensor: layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer.runner._forward_dispatch( + return layer._runner._forward_dispatch( layer, hidden_states, router_logits, @@ -111,7 +111,7 @@ def _moe_forward_shared( layer_name: _layer_name_type, ) -> tuple[torch.Tensor, torch.Tensor]: layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer.runner._forward_dispatch( + return layer._runner._forward_dispatch( layer, hidden_states, router_logits, From 6d0ed3294517a6d8bf5fc60b7dd78f13ad8f1ac1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Mar 2026 03:09:46 +0000 Subject: [PATCH 075/191] make MoERunner into a torch.nn.Module Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 21 +++++++++++++++++-- .../fused_moe/runner/chunking_moe_runner.py | 14 +++++++------ .../layers/fused_moe/runner/moe_runner.py | 9 ++++++-- .../fused_moe/runner/moe_runner_factory.py | 15 ++++++++++++- 
.../model_executor/layers/quantization/fp8.py | 1 + 5 files changed, 49 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d6a7ac462e78..0d440758fbe4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1417,6 +1417,7 @@ def _maybe_make_contiguous( weights = list(self.named_parameters()) # This doesn't work # weights = weights + list(self._runner.quant_method.named_parameters()) + weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] # `w13_input_scale` and `w2_input_scale` are global per-tensor @@ -1429,16 +1430,32 @@ def _maybe_make_contiguous( "w2_input_scale", } + # for name, weight in weights: + # print(f"NAME = {name}") + assert all( weight.is_contiguous() for name, weight in weights - if name not in NON_EXPERT_WEIGHTS + if not ( + "_shared_experts." in name + or "_gate." in name + or "_routed_input_transform." in name + or "_routed_output_transform." in name + ) + and name not in NON_EXPERT_WEIGHTS ) return [ weight.view(self.local_num_experts, -1) for name, weight in weights - if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) + if name not in NON_EXPERT_WEIGHTS + and weight.shape != torch.Size([]) + and "_shared_experts." not in name + # exclude parameters from non-expert submodules, + # e.g. gate/shared/transforms. + and "_gate." not in name + and "_routed_input_transform." not in name + and "_routed_output_transform." 
not in name ] def set_eplb_state( diff --git a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py index 8aee95840cb5..3535fd01da7e 100644 --- a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py @@ -34,7 +34,9 @@ class ChunkingMoERunner(MoERunnerBase): - Processes chunks via inner._forward_impl per chunk """ - def __init__(self, inner: MoERunnerBase): + def __init__(self, inner: MoERunnerBase, **kwargs): + super().__init__(**kwargs) + # Assert that _maybe_dispatch/_maybe_combine will be nops. assert inner.moe_config.pcp_size == 1 @@ -48,11 +50,11 @@ def __init__(self, inner: MoERunnerBase): self._init_dp_chunking() ) - def __getattr__(self, name): - # Delegate attribute access to the inner runner. This is only - # called when normal lookup (instance __dict__, class MRO) fails, - # so ChunkingMoERunner's own attributes and methods take priority. - return getattr(self._inner, name) + # def __getattr__(self, name): + # # Delegate attribute access to the inner runner. This is only + # # called when normal lookup (instance __dict__, class MRO) fails, + # # so ChunkingMoERunner's own attributes and methods take priority. + # return getattr(self._inner, name) def _init_dp_chunking(self) -> list[torch.Tensor]: states_shape: tuple[int, ...] 
diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index b7bd4520b3e2..16b5f540399a 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from abc import ABC, abstractmethod +from abc import abstractmethod import torch @@ -12,7 +12,7 @@ ) -class MoERunner(ABC): +class MoERunner(torch.nn.Module): """ Abstract base class for Mixture of Experts (MoE) runners. @@ -21,6 +21,11 @@ class MoERunner(ABC): expert routing, and managing tensor parallel operations. """ + def __init__(self): + super().__init__() + # HACK + self._already_called_process_weights_after_loading = True + @abstractmethod def forward( self, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py index 4813929aeaa5..c31aec16653e 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py @@ -48,5 +48,18 @@ def create_moe_runner( routed_scaling_factor=routed_scaling_factor, ) if moe_config.moe_parallel_config.use_dp_chunking: - return ChunkingMoERunner(runner) + return ChunkingMoERunner( + inner=runner, + layer=layer, + moe_config=moe_config, + router=router, + routed_input_transform=routed_input_transform, + gate=gate, + shared_experts=shared_experts, + quant_method=quant_method, + enable_dbo=enable_dbo, + routed_output_transform=routed_output_transform, + apply_scale_to_output=apply_scale_to_output, + routed_scaling_factor=routed_scaling_factor, + ) return runner diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e07989734c47..ceaad92e9cb1 100644 --- 
a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -844,6 +844,7 @@ def _setup_kernel( ) def process_weights_after_loading(self, layer: Module) -> None: + # print(f"LAYER {layer}") if getattr(layer, "_already_called_process_weights_after_loading", False): return From b122141de1a655ca69452705bd6c9758e95605d0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Mar 2026 19:30:51 +0000 Subject: [PATCH 076/191] separate out eplb + fix shared_experts Signed-off-by: Bill Nell --- .../layers/fused_moe/eplb_manager.py | 244 ++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 163 +++--------- .../fused_moe/runner/chunking_moe_runner.py | 2 +- .../fused_moe/runner/moe_runner_base.py | 26 +- 4 files changed, 290 insertions(+), 145 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/eplb_manager.py diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py new file mode 100644 index 000000000000..80547f53ee96 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +EPLB (Expert Parallelism Load Balancing) Manager. + +This module provides the EplbManager class which encapsulates all EPLB-related +functionality for MoE layers, including state management, expert weight +collection, and expert parameter mapping. +""" + +from collections.abc import Iterable +from typing import TYPE_CHECKING + +import torch + +from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState + +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +class EplbManager: + """ + Manages Expert Parallelism Load Balancing (EPLB) state and operations + for a MoE layer. 
+ + This class encapsulates all EPLB-related functionality including: + - Runtime state (expert load view, logical-to-physical mapping) + - Expert weight collection for load balancing + - Expert parameter mapping for weight loading with redundant experts + """ + + def __init__( + self, + num_redundant_experts: int = 0, + ): + self.num_redundant_experts = num_redundant_experts + + # Runtime EPLB state + self.state = EplbLayerState() + + def set_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + """ + Register the EPLB state for this layer. + + This is used later in forward pass, where we get the expert mapping + and record the load metrics in `expert_load_view`. + + Args: + moe_layer_idx: Index of this MoE layer + expert_load_view: View into global expert load tracking tensor + logical_to_physical_map: Mapping from logical to physical expert IDs + logical_replica_count: Number of replicas for each logical expert + """ + self.state.expert_load_view = expert_load_view[moe_layer_idx] + self.state.logical_to_physical_map = logical_to_physical_map[moe_layer_idx] + self.state.logical_replica_count = logical_replica_count[moe_layer_idx] + + def get_expert_weights( + self, + layer: "FusedMoE", + ) -> Iterable[torch.Tensor]: + """ + Collect expert weights from the MoE layer for EPLB. + + Returns weights reshaped as (local_num_experts, -1) for efficient + expert weight swapping during load balancing. + + Args: + layer: The FusedMoE layer to collect weights from + + Returns: + Iterable of expert weight tensors + """ + + def _maybe_make_contiguous( + name: str, p: torch.nn.Parameter + ) -> torch.nn.Parameter: + """ + In some cases, the last 2 dimensions (the non-expert dimensions) + of the weight scale tensor are transposed. This function + transforms the tensor (view update) so the tensor is contiguous(). 
+ Example: A non-contiguous scale tensor, + `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to + `x_` of shape (E, 16, 32) and stride (512, 32, 1). + Note that we specifically use torch.transpose() so `x_` refers + to the same underlying memory. The tensors `x` and `x_`, pointing + to the same underlying memory make this transformation safe in the + context of EPLB. i.e. It is the same memory and just the view + is different. + Note: This function handles the "weight_scale" tensors specifically. + This could however be generalized to handle similar tensors. + """ + if p.ndim != 3: + return p + if p.is_contiguous(): + # Already contiguous. do nothing. + return p + # p is non-contiguous. We only handle the case where the last 2 + # dimensions of the scales tensor is transposed. We can handle + # other cases when they become relevant. + is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1 + if "weight_scale" not in name or not is_transposed_12: + # do nothing. + return p + + # Do not update the layer parameter as the layer's MoE operations would + # expect the parameter's tensor to the same shape / stride. Instead, + # make a new torch.nn.Parameter that is used just in the context of + # EPLB. + return torch.nn.Parameter( + torch.transpose(p.data, 1, 2), requires_grad=False + ) + + weights = list(layer.named_parameters()) + weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] + + # `w13_input_scale` and `w2_input_scale` are global per-tensor + # activation scales shared across all experts (e.g. NVFP4). + # They are broadcast views (stride 0) from .expand() and are + # not actual expert weights, so exclude them from EPLB. + NON_EXPERT_WEIGHTS = { + "e_score_correction_bias", + "w13_input_scale", + "w2_input_scale", + } + + assert all( + weight.is_contiguous() + for name, weight in weights + if not ( + "_shared_experts." in name + or "_gate." in name + or "_routed_input_transform." in name + or "_routed_output_transform." 
in name + ) + and name not in NON_EXPERT_WEIGHTS + ) + + return [ + weight.view(layer.local_num_experts, -1) + for name, weight in weights + if name not in NON_EXPERT_WEIGHTS + and weight.shape != torch.Size([]) + and "_shared_experts." not in name + # exclude parameters from non-expert submodules, + # e.g. gate/shared/transforms. + and "_gate." not in name + and "_routed_input_transform." not in name + and "_routed_output_transform." not in name + ] + + @staticmethod + def make_expert_params_mapping( + model: torch.nn.Module, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + num_redundant_experts: int = 0, + ) -> list[tuple[str, str, int, str]]: + """ + Create expert parameter mapping for weight loading with redundant experts. + + In the returned mapping: + - `expert_id` is the physical expert id + - `weight_name` contains the weight name of the logical expert + So that we map the expert id to logical in `weight_name` + + Args: + model: The model containing the MoE layer + ckpt_gate_proj_name: Checkpoint parameter name for gate projection + ckpt_down_proj_name: Checkpoint parameter name for down projection + ckpt_up_proj_name: Checkpoint parameter name for up projection + num_experts: Number of logical experts + num_redundant_experts: Number of redundant experts for EPLB + + Returns: + List of (param_name, weight_name, expert_id, shard_id) tuples + """ + num_physical_experts = num_experts + num_redundant_experts + + # Build initial physical-to-logical mapping + physical_to_logical_map = ( + EplbState.build_initial_global_physical_to_logical_map( + num_experts, num_redundant_experts + ) + ) + + base_layer = ( + "base_layer." + if any(".base_layer." 
in name for name, _ in model.named_parameters()) + else "" + ) + + return [ + # (param_name, weight_name, expert_id, shard_id) + ( + f"experts.{base_layer}w13_" + if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] + else f"experts.{base_layer}w2_", + f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", + expert_id, + shard_id, + ) + for expert_id in range(num_physical_experts) + for shard_id, weight_name in [ + ("w1", ckpt_gate_proj_name), + ("w2", ckpt_down_proj_name), + ("w3", ckpt_up_proj_name), + ] + ] + + @staticmethod + def validate_configuration( + enabled: bool, global_num_experts: int, ep_size: int + ) -> None: + """ + Validate EPLB configuration. + + Args: + enabled: Whether EPLB is enabled + global_num_experts: Total number of experts (including redundant) + ep_size: Expert parallelism size + + Raises: + AssertionError: If configuration is invalid + """ + if not enabled: + return + + # EPLB currently only supports even distribution of experts across ranks + assert global_num_experts % ep_size == 0, ( + f"EPLB currently only supports even distribution of " + f"experts across ranks. Got {global_num_experts} experts " + f"and {ep_size} EP ranks." 
+ ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 0d440758fbe4..0e4d3ec85faa 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -17,7 +17,6 @@ get_pcp_group, get_tensor_model_parallel_world_size, ) -from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe.activation import MoEActivation @@ -26,6 +25,7 @@ FusedMoEParallelConfig, RoutingMethodType, ) +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) @@ -356,6 +356,12 @@ def __init__( self.global_num_experts = num_experts + num_redundant_experts self.logical_num_experts = num_experts + self.enable_eplb = enable_eplb + + # Initialize EPLB manager + self.eplb_manager = EplbManager( + num_redundant_experts=num_redundant_experts, + ) # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping @@ -367,10 +373,6 @@ def __init__( compilation_config.static_forward_context[prefix] = self compilation_config.static_all_moe_layers.append(prefix) self.layer_name = prefix - - self.enable_eplb = enable_eplb - # TODO(bnell): should this be owned by router? - self.eplb_state = EplbLayerState() self.expert_placement_strategy: ExpertPlacementStrategy = ( vllm_config.parallel_config.expert_placement_strategy ) @@ -401,12 +403,12 @@ def __init__( # Determine expert maps if self.use_ep: - if self.enable_eplb: - assert self.global_num_experts % self.ep_size == 0, ( - "EPLB currently only supports even distribution of " - "experts across ranks." 
- ) - else: + # Validate EPLB configuration + self.eplb_manager.validate_configuration( + self.enable_eplb, self.global_num_experts, self.ep_size + ) + + if not self.enable_eplb: assert num_redundant_experts == 0, ( "Redundant experts are only supported with EPLB." ) @@ -489,7 +491,7 @@ def __init__( router = create_fused_moe_router( top_k=top_k, global_num_experts=self.global_num_experts, - eplb_state=self.eplb_state, + eplb_state=self.eplb_manager.state, renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -501,7 +503,7 @@ def __init__( else 1.0, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=self.num_fused_shared_experts, - enable_eplb=enable_eplb, + enable_eplb=self.enable_eplb, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. indices_type_getter=lambda: self._runner.quant_method.topk_indices_dtype, @@ -1375,88 +1377,8 @@ def load_weights( yield param_name def get_expert_weights(self) -> Iterable[torch.Tensor]: - def _maybe_make_contiguous( - name: str, p: torch.nn.Parameter - ) -> torch.nn.Parameter: - """ - In some cases, the last 2 dimensions (the non-expert dimensions) - of the weight scale tensor are transposed. This function - transforms the tensor (view update) so the tensor is contiguous(). - Example: A non-contiguous scale tensor, - `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to - `x_` of shape (E, 16, 32) and stride (512, 32, 1). - Note that we specifically use torch.transpose() so `x_` refers - to the same underlying memory. The tensors `x` and `x_`, pointing - to the same underlying memory make this transformation safe in the - context of EPLB. i.e. It is the same memory and just the view - is different. - Note: This function handles the "weight_scale" tensors specifically. - This could however be generalized to handle similar tensors. - """ - if p.ndim != 3: - return p - if p.is_contiguous(): - # Already contiguous. 
do nothing. - return p - # p is non-contiguous. We only handle the case where the last 2 - # dimensions of the scales tensor is transposed. We can handle - # other cases when they become relevant. - is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1 - if "weight_scale" not in name or not is_transposed_12: - # do nothing. - return p - - # Do not update the layer parameter as the layer's MoE operations would - # expect the parameter's tensor to the same shape / stride. Instead, - # make a new torch.nn.Parameter that is used just in the context of - # EPLB. - return torch.nn.Parameter( - torch.transpose(p.data, 1, 2), requires_grad=False - ) - - weights = list(self.named_parameters()) - # This doesn't work - # weights = weights + list(self._runner.quant_method.named_parameters()) - - weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] - - # `w13_input_scale` and `w2_input_scale` are global per-tensor - # activation scales shared across all experts (e.g. NVFP4). - # They are broadcast views (stride 0) from .expand() and are - # not actual expert weights, so exclude them from EPLB. - NON_EXPERT_WEIGHTS = { - "e_score_correction_bias", - "w13_input_scale", - "w2_input_scale", - } - - # for name, weight in weights: - # print(f"NAME = {name}") - - assert all( - weight.is_contiguous() - for name, weight in weights - if not ( - "_shared_experts." in name - or "_gate." in name - or "_routed_input_transform." in name - or "_routed_output_transform." in name - ) - and name not in NON_EXPERT_WEIGHTS - ) - - return [ - weight.view(self.local_num_experts, -1) - for name, weight in weights - if name not in NON_EXPERT_WEIGHTS - and weight.shape != torch.Size([]) - and "_shared_experts." not in name - # exclude parameters from non-expert submodules, - # e.g. gate/shared/transforms. - and "_gate." not in name - and "_routed_input_transform." not in name - and "_routed_output_transform." 
not in name - ] + """Delegate to EPLB manager.""" + return self.eplb_manager.get_expert_weights(self) def set_eplb_state( self, @@ -1471,9 +1393,12 @@ def set_eplb_state( This is used later in forward pass, where we get the expert mapping and record the load metrics in `expert_load_view`. """ - self.eplb_state.expert_load_view = expert_load_view[moe_layer_idx] - self.eplb_state.logical_to_physical_map = logical_to_physical_map[moe_layer_idx] - self.eplb_state.logical_replica_count = logical_replica_count[moe_layer_idx] + self.eplb_manager.set_state( + moe_layer_idx, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ) def ensure_moe_quant_config_init(self): if self._runner.quant_method.moe_quant_config is None: @@ -1521,42 +1446,16 @@ def make_expert_params_mapping( num_experts: int, num_redundant_experts: int = 0, ) -> list[tuple[str, str, int, str]]: - num_physical_experts = num_experts + num_redundant_experts - - # In the returned mapping: - # - `expert_id` is the physical expert id - # - `weight_name` contains the weight name of the logical expert - # So that we should map the expert id to logical in `weight_name` - physical_to_logical_map = ( - EplbState.build_initial_global_physical_to_logical_map( - num_experts, num_redundant_experts - ) + """Delegate to EPLB manager.""" + return EplbManager.make_expert_params_mapping( + model, + ckpt_gate_proj_name, + ckpt_down_proj_name, + ckpt_up_proj_name, + num_experts, + num_redundant_experts, ) - base_layer = ( - "base_layer." - if any(".base_layer." 
in name for name, _ in model.named_parameters()) - else "" - ) - - return [ - # (param_name, weight_name, expert_id, shard_id) - ( - f"experts.{base_layer}w13_" - if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] - else f"experts.{base_layer}w2_", - f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", - expert_id, - shard_id, - ) - for expert_id in range(num_physical_experts) - for shard_id, weight_name in [ - ("w1", ckpt_gate_proj_name), - ("w2", ckpt_down_proj_name), - ("w3", ckpt_up_proj_name), - ] - ] - def extra_repr(self) -> str: s = ( f"global_num_experts={self.global_num_experts}, " diff --git a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py index 3535fd01da7e..d4783e0572e6 100644 --- a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py @@ -35,7 +35,7 @@ class ChunkingMoERunner(MoERunnerBase): """ def __init__(self, inner: MoERunnerBase, **kwargs): - super().__init__(**kwargs) + super().__init__(**kwargs) # this is not ideal # Assert that _maybe_dispatch/_maybe_combine will be nops. assert inner.moe_config.pcp_size == 1 diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 47a19d9d8ec2..f05743491d67 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -203,11 +203,13 @@ def __init__( self.router = router self.routed_input_transform = routed_input_transform self.gate = gate - self._shared_experts = SharedExperts( - shared_experts, - moe_config=moe_config, - mk_owns_shared_expert=quant_method.mk_owns_shared_expert, # ? 
- ) + self._shared_experts: SharedExperts | None = None + if shared_experts is not None: + self._shared_experts = SharedExperts( + shared_experts, + moe_config=moe_config, + mk_owns_shared_expert=quant_method.mk_owns_shared_expert, # ? + ) self._quant_method = quant_method self.enable_dbo = enable_dbo self.enable_eplb = moe_config.moe_parallel_config.enable_eplb @@ -247,11 +249,11 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: # TODO: Once the OOM issue for the TPU backend is resolved, we # will switch to using the moe_forward custom op. # Note: CPU doesn't require wrapped _forward_impl. - return _moe_forward if self.shared_experts is None else _moe_forward_shared + return _moe_forward if self._shared_experts is None else _moe_forward_shared return ( torch.ops.vllm.moe_forward - if self.shared_experts is None + if self._shared_experts is None else torch.ops.vllm.moe_forward_shared ) @@ -280,7 +282,7 @@ def apply_routed_input_transform( return ( hidden_states, - hidden_states if self.shared_experts is not None else None, + hidden_states if self._shared_experts is not None else None, ) def apply_routed_output_transform( @@ -332,7 +334,7 @@ def _must_reduce_shared_expert_output(self) -> bool: early. """ return ( - self.shared_experts is not None + self._shared_experts is not None and self.quant_method.moe_kernel is not None and self.quant_method.moe_kernel.output_is_reduced() ) @@ -441,9 +443,9 @@ def _maybe_apply_shared_experts( model's overlap strategy. Only fires if shared experts are configured and the order matches the shared experts' configured execution point. 
""" - if self.shared_experts is not None: + if self._shared_experts is not None: assert shared_experts_input is not None - self.shared_experts.apply(shared_experts_input, order) + self._shared_experts.apply(shared_experts_input, order) def _apply_quant_method( self, @@ -490,7 +492,7 @@ def _apply_quant_method( ) return ( - self.shared_experts.output if self.shared_experts is not None else None, + self._shared_experts.output if self._shared_experts is not None else None, fused_out, ) From fd361fd7cef9ecad9a9945fcd3a8a789cda8d10a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 24 Mar 2026 20:00:13 +0000 Subject: [PATCH 077/191] one eplb fix Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/eplb_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index 80547f53ee96..b937c4d55ce1 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -145,7 +145,7 @@ def _maybe_make_contiguous( ) return [ - weight.view(layer.local_num_experts, -1) + weight.data.view(layer.local_num_experts, -1) for name, weight in weights if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) From 018195f2e72aed4b1e83d97969533680e8ba31a1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 04:22:08 +0000 Subject: [PATCH 078/191] fixes Signed-off-by: Bill Nell --- .../layers/fused_moe/eplb_manager.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index b937c4d55ce1..ec54f27a3f1e 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -136,10 +136,10 @@ def _maybe_make_contiguous( weight.is_contiguous() for name, weight in weights if not ( - 
"_shared_experts." in name - or "_gate." in name - or "_routed_input_transform." in name - or "_routed_output_transform." in name + name.startswith("_shared_experts.") + or name.startswith("_gate.") + or name.startswith("_routed_input_transform.") + or name.startswith("_routed_output_transform.") ) and name not in NON_EXPERT_WEIGHTS ) @@ -149,12 +149,12 @@ def _maybe_make_contiguous( for name, weight in weights if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) - and "_shared_experts." not in name + and not name.startswith("_shared_experts.") # exclude parameters from non-expert submodules, # e.g. gate/shared/transforms. - and "_gate." not in name - and "_routed_input_transform." not in name - and "_routed_output_transform." not in name + and not name.startswith("_gate.") + and not name.startswith("_routed_input_transform.") + and not name.startswith("_routed_output_transform.") ] @staticmethod From 6664f6ce5ba6f8d0f0b3fbced119b16da0004608 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 18:54:42 +0000 Subject: [PATCH 079/191] hack fixes for chunked runner Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 6 ++++-- vllm/model_executor/layers/fused_moe/layer.py | 6 +++--- .../layers/fused_moe/runner/chunking_moe_runner.py | 11 +++++++++++ .../layers/fused_moe/runner/shared_experts.py | 2 +- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index fd9023b01fef..db6b9fed5cda 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -134,12 +134,14 @@ def _inject_lora_into_fused_moe(self): self.base_layer.ensure_moe_quant_config_init() quant_config = self.base_layer.quant_method.moe_quant_config - if getattr(self.base_layer.quant_method, "supports_internal_mk", False): + if getattr( + self.base_layer.quant_method, "supports_internal_mk", False + ): # XXXXXXXXXXX # Use the existing modular kernel from the quant method 
m_fused_moe_fn = self.base_layer.quant_method.moe_kernel # Don't let the kernel own shared experts so the runner can # overlap them with routed experts via a separate CUDA stream. - m_fused_moe_fn.shared_experts = None + m_fused_moe_fn.shared_experts = None # XXXXXXXXXXXXXXXXXXXXXXXXXXXX else: # Create a new modular kernel via select_gemm_impl. # Don't pass shared_experts to the kernel so the runner can diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 0e4d3ec85faa..d08f7f0be40e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -576,9 +576,9 @@ def __init__( quant_config, self.moe_config, ) - self.quant_method = ( - quant_method # only for weight loading. how to get around this? - ) + + # TODO(bnell): only for weight loading. how to get around this? + self.quant_method = quant_method if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): raise NotImplementedError( diff --git a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py index d4783e0572e6..bdae155aab54 100644 --- a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py @@ -7,6 +7,9 @@ get_forward_context, ) from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, +) from vllm.model_executor.layers.fused_moe.runner.moe_runner_base import MoERunnerBase from vllm.utils.math_utils import cdiv from vllm.v1.worker.ubatching import dbo_current_ubatch_id @@ -37,6 +40,9 @@ class ChunkingMoERunner(MoERunnerBase): def __init__(self, inner: MoERunnerBase, **kwargs): super().__init__(**kwargs) # this is not ideal + # TODO(bnell): fix this + self._shared_experts = inner._shared_experts + # Assert that _maybe_dispatch/_maybe_combine 
will be nops. assert inner.moe_config.pcp_size == 1 @@ -56,6 +62,11 @@ def __init__(self, inner: MoERunnerBase, **kwargs): # # so ChunkingMoERunner's own attributes and methods take priority. # return getattr(self._inner, name) + def _replace_quant_method(self, quant_method: FusedMoEMethodBase): + self._quant_method = quant_method + self._inner._replace_quant_method(quant_method) + assert self._shared_experts == self._inner._shared_experts + def _init_dp_chunking(self) -> list[torch.Tensor]: states_shape: tuple[int, ...] logits_shape: tuple[int, ...] diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index b318e9eb3938..498fc43ede2e 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -87,7 +87,7 @@ def _determine_shared_experts_order( if self._has_external_experts and not self._use_dp_chunking: return SharedExpertsOrder.EXTERNAL - if self._quant_method.mk_owns_shared_expert: + if self._mk_owns_shared_expert: return SharedExpertsOrder.MK_INTERNAL_OVERLAPPED should_run_shared_in_aux_stream = ( From 0002540ca51cbfdaf47fac94d875123ca6f30608 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 19:59:00 +0000 Subject: [PATCH 080/191] fix lora Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index db6b9fed5cda..c7b2ac198bb8 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -134,14 +134,12 @@ def _inject_lora_into_fused_moe(self): self.base_layer.ensure_moe_quant_config_init() quant_config = self.base_layer.quant_method.moe_quant_config - if getattr( - self.base_layer.quant_method, "supports_internal_mk", False - ): # XXXXXXXXXXX + if getattr(self.base_layer.quant_method, "supports_internal_mk", False): # Use 
the existing modular kernel from the quant method m_fused_moe_fn = self.base_layer.quant_method.moe_kernel # Don't let the kernel own shared experts so the runner can # overlap them with routed experts via a separate CUDA stream. - m_fused_moe_fn.shared_experts = None # XXXXXXXXXXXXXXXXXXXXXXXXXXXX + m_fused_moe_fn.shared_experts = None else: # Create a new modular kernel via select_gemm_impl. # Don't pass shared_experts to the kernel so the runner can @@ -152,7 +150,6 @@ def _inject_lora_into_fused_moe(self): self.base_layer.quant_method.select_gemm_impl( prepare_finalize, self.base_layer ), - self.base_layer.shared_experts, ) # TODO: could be incorrect due to monolithic kernel? or add assert it From d9e94950e624795724455e96fe5085b747c654ec Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 25 Mar 2026 20:10:28 +0000 Subject: [PATCH 081/191] fix eplb bug Signed-off-by: Bill Nell --- .../layers/fused_moe/eplb_manager.py | 16 ++++++++-------- .../layers/fused_moe/runner/moe_runner_base.py | 5 +---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index ec54f27a3f1e..b03aaf3fc6d9 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -136,10 +136,10 @@ def _maybe_make_contiguous( weight.is_contiguous() for name, weight in weights if not ( - name.startswith("_shared_experts.") - or name.startswith("_gate.") - or name.startswith("_routed_input_transform.") - or name.startswith("_routed_output_transform.") + name.startswith("_runner._shared_experts.") # not correct? 
+ or name.startswith("_runner.gate.") + or name.startswith("_runner.routed_input_transform.") + or name.startswith("_runner.routed_output_transform.") ) and name not in NON_EXPERT_WEIGHTS ) @@ -149,12 +149,12 @@ def _maybe_make_contiguous( for name, weight in weights if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) - and not name.startswith("_shared_experts.") + and not name.startswith("_runner._shared_experts.") # not correct? # exclude parameters from non-expert submodules, # e.g. gate/shared/transforms. - and not name.startswith("_gate.") - and not name.startswith("_routed_input_transform.") - and not name.startswith("_routed_output_transform.") + and not name.startswith("_runner.gate.") + and not name.startswith("_runner.routed_input_transform.") + and not name.startswith("_runner.routed_output_transform.") ] @staticmethod diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index f05743491d67..fa98c90650c1 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -202,6 +202,7 @@ def __init__( self.moe_config = moe_config self.router = router self.routed_input_transform = routed_input_transform + self.routed_output_transform = routed_output_transform self.gate = gate self._shared_experts: SharedExperts | None = None if shared_experts is not None: @@ -213,7 +214,6 @@ def __init__( self._quant_method = quant_method self.enable_dbo = enable_dbo self.enable_eplb = moe_config.moe_parallel_config.enable_eplb - self.routed_output_transform = routed_output_transform self.apply_scale_to_output = ( apply_scale_to_output and routed_scaling_factor != 1.0 ) @@ -257,9 +257,6 @@ def _select_forward(self, layer: torch.nn.Module) -> Callable: else torch.ops.vllm.moe_forward_shared ) - def is_internal_router(self) -> bool: - return self.gate is not None - def 
apply_routed_input_transform( self, hidden_states: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor | None]: From 4270ba7cce10459938e7e4116f816a777f749dd5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Mar 2026 00:38:22 +0000 Subject: [PATCH 082/191] turn SharedExperts into a torch.nn.Module Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 265 +++++++++--------- .../layers/fused_moe/eplb_manager.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 25 +- .../layers/fused_moe/modular_kernel.py | 2 +- .../fused_moe/runner/moe_runner_base.py | 2 +- .../layers/fused_moe/runner/shared_experts.py | 9 +- 6 files changed, 157 insertions(+), 150 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index e55bbe640af3..6850afd71b6a 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1202,157 +1202,159 @@ def _run_one_config( via parallel_launch_with_config, passing either _test_body_regular or _test_body_eplb as the test_body_fn parameter. 
""" - world_size = tp_size * dp_size - use_ep = ep_size > 1 - - assert vllm_config.parallel_config.enable_expert_parallel == use_ep - - in_dtype = torch.bfloat16 - device = torch.accelerator.current_accelerator() - - if not is_workspace_manager_initialized(): - init_workspace_manager(device) - - # Create test data and transforms - test_data = setup_moe_test_data( - m=m, - k=k, - n=n, - num_experts=num_experts, - in_dtype=in_dtype, - use_shared_experts=use_shared_experts, - use_gate=use_gate, - use_routed_input_transform=use_routed_input_transform, - backend=backend, - device=device, - ) - - # Extract data from test_data - hidden_states = test_data.hidden_states - router_logits = test_data.router_logits - w1 = test_data.w1 - w2 = test_data.w2 - shared_experts_config = test_data.shared_experts_config - gate = test_data.gate - routed_input_transform = test_data.routed_input_transform - routed_output_transform = test_data.routed_output_transform - - baseline_layer = make_fake_moe_layer( - w1=w1, - w2=w2, - top_k=top_k, - global_num_experts=num_experts, - in_dtype=in_dtype, - quant_dtype=None, # quantization_to_quant_dtype(quantization), - renormalize=False, - shared_experts_config=shared_experts_config, - gate=gate, - routed_input_transform=routed_input_transform, - routed_output_transform=routed_output_transform, - use_ep=use_ep, - tp_size=tp_size, - ep_size=ep_size, - dp_size=dp_size, - ) + try: + world_size = tp_size * dp_size + use_ep = ep_size > 1 - baseline_output = baseline_layer(hidden_states, router_logits) + assert vllm_config.parallel_config.enable_expert_parallel == use_ep - with set_current_vllm_config(vllm_config): - # Chunk weights for EP/TP (after baseline is created) - if ep_size > 1: - w1 = chunk_by_rank(w1, dp_rank, dp_size, dim=0, device=device) - w2 = chunk_by_rank(w2, dp_rank, dp_size, dim=0, device=device) + in_dtype = torch.bfloat16 + device = torch.accelerator.current_accelerator() - if tp_size > 1: - w1 = tp_chunk_gate_up(w1, tp_rank, tp_size, 
dim=1, device=device) - w2 = chunk_by_rank(w2, tp_rank, tp_size, dim=2, device=device) + if not is_workspace_manager_initialized(): + init_workspace_manager(device) - # Setup shared experts if needed - shared_experts = create_shared_experts_from_config( - shared_experts_config, in_dtype, tp_size, tp_rank, device + # Create test data and transforms + test_data = setup_moe_test_data( + m=m, + k=k, + n=n, + num_experts=num_experts, + in_dtype=in_dtype, + use_shared_experts=use_shared_experts, + use_gate=use_gate, + use_routed_input_transform=use_routed_input_transform, + backend=backend, + device=device, ) - # Determine hidden size for MoE layer - # When using routed_input_transform, experts operate in latent space - hidden_size_for_layer = k // 2 if routed_input_transform is not None else k - - # Create initial MoE layer - moe_layer = make_fused_moe_layer( - quantization=quantization, - use_ep=use_ep, - hidden_size=hidden_size_for_layer, - intermediate_size=n, - in_dtype=in_dtype, - tp_size=tp_size, - ep_size=ep_size, - dp_size=dp_size, + # Extract data from test_data + hidden_states = test_data.hidden_states + router_logits = test_data.router_logits + w1 = test_data.w1 + w2 = test_data.w2 + shared_experts_config = test_data.shared_experts_config + gate = test_data.gate + routed_input_transform = test_data.routed_input_transform + routed_output_transform = test_data.routed_output_transform + + baseline_layer = make_fake_moe_layer( w1=w1, w2=w2, top_k=top_k, global_num_experts=num_experts, - shared_experts=shared_experts, + in_dtype=in_dtype, + quant_dtype=None, # quantization_to_quant_dtype(quantization), + renormalize=False, + shared_experts_config=shared_experts_config, gate=gate, routed_input_transform=routed_input_transform, routed_output_transform=routed_output_transform, - ) - - # Necessary? 
- if moe_layer._expert_map is not None: - moe_layer._expert_map = moe_layer._expert_map.to(device) - - num_tokens = m - num_tokens_across_dp = torch.tensor( - [num_tokens] * world_size, - device=device, - dtype=torch.int, - ) - - # Call the test body function with all necessary context - expected, actual = test_body_fn( - moe_fn=moe_layer, - moe_layer=moe_layer, - hidden_states=hidden_states, - router_logits=router_logits, - vllm_config=vllm_config, - num_tokens=num_tokens, - num_tokens_across_dp=num_tokens_across_dp, - in_dtype=in_dtype, - quantization=quantization, use_ep=use_ep, tp_size=tp_size, ep_size=ep_size, dp_size=dp_size, - w1=w1, - w2=w2, - num_experts=num_experts, - k=k, - n=n, - m=m, - top_k=top_k, - shared_experts=shared_experts, - gate=gate, - routed_input_transform=routed_input_transform, - routed_output_transform=routed_output_transform, - baseline_output=baseline_output, - **kwargs, ) - # Common tolerance logic - # TODO: consider associating tolerances with quant methods. 
- if quantization is None: - if k >= 2048: - atol, rtol = 6.5e-2, 6.5e-2 + baseline_output = baseline_layer(hidden_states, router_logits) + + with set_current_vllm_config(vllm_config): + # Chunk weights for EP/TP (after baseline is created) + if ep_size > 1: + w1 = chunk_by_rank(w1, dp_rank, dp_size, dim=0, device=device) + w2 = chunk_by_rank(w2, dp_rank, dp_size, dim=0, device=device) + + if tp_size > 1: + w1 = tp_chunk_gate_up(w1, tp_rank, tp_size, dim=1, device=device) + w2 = chunk_by_rank(w2, tp_rank, tp_size, dim=2, device=device) + + # Setup shared experts if needed + shared_experts = create_shared_experts_from_config( + shared_experts_config, in_dtype, tp_size, tp_rank, device + ) + + # Determine hidden size for MoE layer + # When using routed_input_transform, experts operate in latent space + hidden_size_for_layer = k // 2 if routed_input_transform is not None else k + + # Create initial MoE layer + moe_layer = make_fused_moe_layer( + quantization=quantization, + use_ep=use_ep, + hidden_size=hidden_size_for_layer, + intermediate_size=n, + in_dtype=in_dtype, + tp_size=tp_size, + ep_size=ep_size, + dp_size=dp_size, + w1=w1, + w2=w2, + top_k=top_k, + global_num_experts=num_experts, + shared_experts=shared_experts, + gate=gate, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + ) + + # Necessary? 
+ if moe_layer._expert_map is not None: + moe_layer._expert_map = moe_layer._expert_map.to(device) + + num_tokens = m + num_tokens_across_dp = torch.tensor( + [num_tokens] * world_size, + device=device, + dtype=torch.int, + ) + + # Call the test body function with all necessary context + expected, actual = test_body_fn( + moe_fn=moe_layer, + moe_layer=moe_layer, + hidden_states=hidden_states, + router_logits=router_logits, + vllm_config=vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + in_dtype=in_dtype, + quantization=quantization, + use_ep=use_ep, + tp_size=tp_size, + ep_size=ep_size, + dp_size=dp_size, + w1=w1, + w2=w2, + num_experts=num_experts, + k=k, + n=n, + m=m, + top_k=top_k, + shared_experts=shared_experts, + gate=gate, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + baseline_output=baseline_output, + **kwargs, + ) + + # Common tolerance logic + # TODO: consider associating tolerances with quant methods. + if quantization is None: + if k >= 2048: + atol, rtol = 6.5e-2, 6.5e-2 + else: + atol, rtol = 3.5e-2, 3.5e-2 + elif quantization in ("fp8", "modelopt_fp8"): + atol, rtol = 6e-2, 6e-2 + elif quantization == "modelopt_fp4": + atol = rtol = 1e-1 + k * 5e-4 else: - atol, rtol = 3.5e-2, 3.5e-2 - elif quantization in ("fp8", "modelopt_fp8"): - atol, rtol = 6e-2, 6e-2 - elif quantization == "modelopt_fp4": - atol = rtol = 1e-1 + k * 5e-4 - else: - atol, rtol = 6e-2, 6e-2 + atol, rtol = 6e-2, 6e-2 - torch.accelerator.synchronize() # TODO: Is this needed? 
- torch.testing.assert_close(expected, actual, atol=atol, rtol=rtol) + torch.testing.assert_close(expected, actual, atol=atol, rtol=rtol) + finally: + torch.accelerator.synchronize() # Test for non-parallel cases (world_size == 1) - backend doesn't matter @@ -1612,5 +1614,4 @@ def test_moe_layer( else: pytest.skip("No valid test configs for current parallel config.") finally: - torch.accelerator.synchronize() # TODO: Is this needed? - torch.accelerator.empty_cache() + torch.accelerator.synchronize() diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index b03aaf3fc6d9..f947786ee74a 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -136,7 +136,7 @@ def _maybe_make_contiguous( weight.is_contiguous() for name, weight in weights if not ( - name.startswith("_runner._shared_experts.") # not correct? + name.startswith("_runner._shared_experts._layer") or name.startswith("_runner.gate.") or name.startswith("_runner.routed_input_transform.") or name.startswith("_runner.routed_output_transform.") @@ -149,7 +149,7 @@ def _maybe_make_contiguous( for name, weight in weights if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) - and not name.startswith("_runner._shared_experts.") # not correct? + and not name.startswith("_runner._shared_experts._layer") # exclude parameters from non-expert submodules, # e.g. gate/shared/transforms. 
and not name.startswith("_runner.gate.") diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d08f7f0be40e..9b8aae518c09 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -434,18 +434,19 @@ def __init__( self.register_buffer("_expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) self._maybe_init_expert_routing_tables() - logger.info_once( - "[EP Rank %s/%s] Expert parallelism is enabled. Expert " - "placement strategy: %s. Local/global" - " number of experts: %s/%s. Experts local to global index map:" - " %s.", - self.ep_rank, - self.ep_size, - self.expert_placement_strategy, - self.local_num_experts, - self.global_num_experts, - get_compressed_expert_map(self._expert_map), - ) + if False: + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Expert " + "placement strategy: %s. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", + self.ep_rank, + self.ep_size, + self.expert_placement_strategy, + self.local_num_experts, + self.global_num_experts, + get_compressed_expert_map(self._expert_map), + ) else: self.local_num_experts, self._expert_map, self.expert_mask = ( self.global_num_experts, diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index f51c7fa53b43..84eafde51d78 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1084,7 +1084,7 @@ def _maybe_apply_shared_experts( ): if self.shared_experts is not None: assert shared_experts_input is not None - self.shared_experts.apply( + self.shared_experts( shared_experts_input, SharedExpertsOrder.MK_INTERNAL_OVERLAPPED, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 
fa98c90650c1..b1bf3838418f 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -442,7 +442,7 @@ def _maybe_apply_shared_experts( """ if self._shared_experts is not None: assert shared_experts_input is not None - self._shared_experts.apply(shared_experts_input, order) + self._shared_experts(shared_experts_input, order) def _apply_quant_method( self, diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 498fc43ede2e..c1aceacf2efa 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -37,13 +37,15 @@ class SharedExpertsOrder(IntEnum): MULTI_STREAM_OVERLAPPED = (4,) -class SharedExperts: +class SharedExperts(torch.nn.Module): def __init__( self, layer: torch.nn.Module, moe_config: FusedMoEConfig, mk_owns_shared_expert: bool, ): + super().__init__() + self._output: torch.Tensor | None = None self._layer = layer self._moe_config = moe_config @@ -151,7 +153,7 @@ def output(self) -> torch.Tensor: self._output = None return output - def apply( + def forward( self, shared_experts_input: torch.Tensor, order: SharedExpertsOrder, @@ -167,3 +169,6 @@ def apply( self._output = self._run_in_aux_stream(shared_experts_input) else: self._output = self._layer(shared_experts_input) + + # def __call__(self, *args, **kwargs): + # return self.forward(*args, **kwargs) From b807df1817633f181aa983e3831acd0a01638e40 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Mar 2026 02:49:22 +0000 Subject: [PATCH 083/191] move eplb state to router Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 6 ++- .../layers/fused_moe/eplb_manager.py | 6 +-- vllm/model_executor/layers/fused_moe/layer.py | 50 +++++++++---------- .../layers/fused_moe/router/base_router.py | 11 ++-- 
.../fused_moe/router/custom_routing_router.py | 6 +-- .../fused_moe/router/fused_moe_router.py | 6 +++ .../router/fused_topk_bias_router.py | 6 +-- .../fused_moe/router/fused_topk_router.py | 6 +-- .../fused_moe/router/grouped_topk_router.py | 6 +-- .../fused_moe/router/memoizing_router.py | 35 ------------- .../layers/fused_moe/router/router_factory.py | 22 ++++---- .../router/routing_simulator_router.py | 6 +-- .../fused_moe/router/zero_expert_router.py | 6 +-- 13 files changed, 72 insertions(+), 100 deletions(-) delete mode 100644 vllm/model_executor/layers/fused_moe/router/memoizing_router.py diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index 6850afd71b6a..654c0fb23b84 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE, fused_experts from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) @@ -920,6 +921,7 @@ def make_fake_moe_layer( expert_load_view: torch.Tensor | None = None, logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, + num_redundant_experts: int = 0, gate: torch.nn.Module | None = None, routed_input_transform: torch.nn.Module | None = None, routed_output_transform: torch.nn.Module | None = None, @@ -930,10 +932,12 @@ def make_fake_moe_layer( ) -> Callable: activation = MoEActivation.from_str(activation) + eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) + router = create_fused_moe_router( top_k=top_k, global_num_experts=global_num_experts, - # eplb_state=None, # TODO + eplb_manager=eplb_manager, renormalize=renormalize, 
use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index f947786ee74a..f48a0e418b62 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -10,15 +10,11 @@ """ from collections.abc import Iterable -from typing import TYPE_CHECKING import torch from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.layer import FusedMoE - class EplbManager: """ @@ -65,7 +61,7 @@ def set_state( def get_expert_weights( self, - layer: "FusedMoE", + layer: torch.nn.Module, # FusedMoE ) -> Iterable[torch.Tensor]: """ Collect expert weights from the MoE layer for EPLB. diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9b8aae518c09..c0eff6980169 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -356,12 +356,9 @@ def __init__( self.global_num_experts = num_experts + num_redundant_experts self.logical_num_experts = num_experts - self.enable_eplb = enable_eplb - # Initialize EPLB manager - self.eplb_manager = EplbManager( - num_redundant_experts=num_redundant_experts, - ) + # Initialize EPLB manager (or None?) 
+ eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping @@ -404,11 +401,11 @@ def __init__( # Determine expert maps if self.use_ep: # Validate EPLB configuration - self.eplb_manager.validate_configuration( - self.enable_eplb, self.global_num_experts, self.ep_size + eplb_manager.validate_configuration( + enable_eplb, self.global_num_experts, self.ep_size ) - if not self.enable_eplb: + if not enable_eplb: assert num_redundant_experts == 0, ( "Redundant experts are only supported with EPLB." ) @@ -418,7 +415,7 @@ def __init__( moe_parallel_config=self.moe_parallel_config, num_expert_group=num_expert_group, num_redundant_experts=num_redundant_experts, - enable_eplb=self.enable_eplb, + enable_eplb=enable_eplb, ) self._expert_map: torch.Tensor | None @@ -434,19 +431,18 @@ def __init__( self.register_buffer("_expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) self._maybe_init_expert_routing_tables() - if False: - logger.info_once( - "[EP Rank %s/%s] Expert parallelism is enabled. Expert " - "placement strategy: %s. Local/global" - " number of experts: %s/%s. Experts local to global index map:" - " %s.", - self.ep_rank, - self.ep_size, - self.expert_placement_strategy, - self.local_num_experts, - self.global_num_experts, - get_compressed_expert_map(self._expert_map), - ) + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Expert " + "placement strategy: %s. Local/global" + " number of experts: %s/%s. 
Experts local to global index map:" + " %s.", + self.ep_rank, + self.ep_size, + self.expert_placement_strategy, + self.local_num_experts, + self.global_num_experts, + get_compressed_expert_map(self._expert_map), + ) else: self.local_num_experts, self._expert_map, self.expert_mask = ( self.global_num_experts, @@ -492,7 +488,7 @@ def __init__( router = create_fused_moe_router( top_k=top_k, global_num_experts=self.global_num_experts, - eplb_state=self.eplb_manager.state, + eplb_manager=eplb_manager, renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -504,7 +500,7 @@ def __init__( else 1.0, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=self.num_fused_shared_experts, - enable_eplb=self.enable_eplb, + enable_eplb=enable_eplb, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. indices_type_getter=lambda: self._runner.quant_method.topk_indices_dtype, @@ -586,7 +582,7 @@ def __init__( "is_act_and_mul=False is supported only for CUDA and ROCm for now" ) - if self.enable_eplb and not quant_method.supports_eplb: + if enable_eplb and not quant_method.supports_eplb: # TODO: Add support for additional quantization methods. # The implementation for other quantization methods does not # contain essential differences, but the current quant API @@ -1379,7 +1375,7 @@ def load_weights( def get_expert_weights(self) -> Iterable[torch.Tensor]: """Delegate to EPLB manager.""" - return self.eplb_manager.get_expert_weights(self) + return self._runner.router.eplb_manager.get_expert_weights(self) def set_eplb_state( self, @@ -1394,7 +1390,7 @@ def set_eplb_state( This is used later in forward pass, where we get the expert mapping and record the load metrics in `expert_load_view`. 
""" - self.eplb_manager.set_state( + self._runner.router.eplb_manager.set_state( moe_layer_idx, expert_load_view, logical_to_physical_map, diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index 6332827d1d09..25d7a46fcfc9 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -5,7 +5,7 @@ import torch -from vllm.distributed.eplb.eplb_state import EplbLayerState +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, ) @@ -109,7 +109,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, + eplb_manager: EplbManager, enable_eplb: bool = False, # TODO(bnell): Once the MK is constructed at layer init time, we # can make this a plain value instead of a callback. @@ -124,11 +124,16 @@ def __init__( super().__init__() self.top_k = top_k self.global_num_experts = global_num_experts - self.eplb_state = eplb_state + self._eplb_manager = eplb_manager + self.eplb_state = eplb_manager.state self.enable_eplb = enable_eplb self.indices_type_getter = indices_type_getter self.capture_fn: Callable[[torch.Tensor], None] | None = None + @property + def eplb_manager(self) -> EplbManager: + return self._eplb_manager + def set_capture_fn(self, capture_fn: Callable[[torch.Tensor], None] | None) -> None: """Set a capture callback for logical routed expert IDs.""" self.capture_fn = capture_fn diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py index 0367189ca1ab..6baa56a05b89 100644 --- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py +++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py @@ -4,8 +4,8 @@ import torch -from 
vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import RoutingMethodType +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter @@ -16,7 +16,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, + eplb_manager: EplbManager, custom_routing_function: Callable, renormalize: bool = True, enable_eplb: bool = False, @@ -25,7 +25,7 @@ def __init__( super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py index d7aed4fdeb2b..0553bf63ecfa 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py @@ -6,6 +6,7 @@ import torch from vllm.model_executor.layers.fused_moe.config import RoutingMethodType +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager class FusedMoERouter(ABC): @@ -26,6 +27,11 @@ def set_capture_fn( def routing_method_type(self) -> RoutingMethodType: raise NotImplementedError + @property + @abstractmethod + def eplb_manager(self) -> EplbManager: + raise NotImplementedError + @abstractmethod def select_experts( self, diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 5beb782d7386..dbc2cf4f89e2 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -7,7 +7,6 @@ import vllm._custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops -from vllm.distributed.eplb.eplb_state import 
EplbLayerState from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) @@ -15,6 +14,7 @@ RoutingMethodType, get_routing_method_type, ) +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter @@ -177,7 +177,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, + eplb_manager: EplbManager, e_score_correction_bias: torch.Tensor, scoring_func: str, renormalize: bool = True, @@ -188,7 +188,7 @@ def __init__( super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index 01376e6b16b5..229f15dcdaaf 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -6,11 +6,11 @@ import vllm._custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, get_routing_method_type, ) +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter @@ -120,7 +120,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, + eplb_manager: EplbManager, scoring_func: str = "softmax", renormalize: bool = True, enable_eplb: bool = False, @@ -129,7 +129,7 @@ def __init__( super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) diff --git 
a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py index 5af2e31b2320..70bb99801567 100644 --- a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py @@ -8,7 +8,6 @@ from vllm import _custom_ops as ops from vllm import envs as envs from vllm._aiter_ops import rocm_aiter_ops -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, @@ -17,6 +16,7 @@ RoutingMethodType, get_routing_method_type, ) +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_grouped_topk, ) @@ -254,7 +254,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, + eplb_manager: EplbManager, num_expert_group: int, topk_group: int, renormalize: bool = True, @@ -268,7 +268,7 @@ def __init__( super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/memoizing_router.py b/vllm/model_executor/layers/fused_moe/router/memoizing_router.py deleted file mode 100644 index a55bd2f09d6f..000000000000 --- a/vllm/model_executor/layers/fused_moe/router/memoizing_router.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable - -import torch - -from vllm.model_executor.layers.fused_moe.config import ( - RoutingMethodType, -) -from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter - - -class MemoizingRouter(FusedMoERouter): - def 
__init__(self, router: FusedMoERouter): - self.router = router - - def set_capture_fn( - self, - capture_fn: Callable[[torch.Tensor], None] | None, - ) -> None: - self.router.set_capture_fn(capture_fn) - self.results: tuple[torch.Tensor, torch.Tensor] | None = None - - @property - def routing_method_type(self) -> RoutingMethodType: - return self.router.routing_method_type - - def select_experts( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - if self.results is None: - self.results = self.router.select_experts(hidden_states, router_logits) - return self.results diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py index 42d418d7e537..32125db9310c 100644 --- a/vllm/model_executor/layers/fused_moe/router/router_factory.py +++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py @@ -5,8 +5,8 @@ import torch import vllm.envs as envs -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import RoutingMethodType +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.custom_routing_router import ( CustomRoutingRouter, ) @@ -29,8 +29,6 @@ ZeroExpertRouter, ) -EMPTY_EPLB_STATE: EplbLayerState = EplbLayerState() - def create_fused_moe_router( # common parameters @@ -51,7 +49,7 @@ def create_fused_moe_router( custom_routing_function: Callable | None = None, # eplb parameters enable_eplb: bool = False, - eplb_state: EplbLayerState = EMPTY_EPLB_STATE, + eplb_manager: EplbManager | None = None, # zero expert parameters zero_expert_type: str | None = None, num_logical_experts: int | None = None, @@ -91,7 +89,7 @@ def create_fused_moe_router( EPLB arguments: enable_eplb: Whether EPLB is enabled - eplb_state: EPLB (Expert Parallelism Load Balancing) state + eplb_manager: EPLB (Expert Parallelism Load 
Balancing) manager Zero expert arguments: zero_expert_type: Type of zero expert (e.g. identity). If not None, @@ -103,12 +101,14 @@ def create_fused_moe_router( An instance of the appropriate FusedMoERouter subclass """ + assert eplb_manager is not None, "eplb_manager is required" + routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY if routing_strategy != "": return RoutingSimulatorRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) @@ -123,7 +123,7 @@ def create_fused_moe_router( return ZeroExpertRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, e_score_correction_bias=e_score_correction_bias, num_logical_experts=num_logical_experts, zero_expert_type=zero_expert_type, @@ -144,7 +144,7 @@ def create_fused_moe_router( grouped_topk_router = GroupedTopKRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, num_expert_group=num_expert_group, topk_group=topk_group, renormalize=renormalize, @@ -172,7 +172,7 @@ def create_fused_moe_router( return CustomRoutingRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, custom_routing_function=custom_routing_function, renormalize=renormalize, enable_eplb=enable_eplb, @@ -183,7 +183,7 @@ def create_fused_moe_router( return FusedTopKBiasRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, e_score_correction_bias=e_score_correction_bias, scoring_func=scoring_func, renormalize=renormalize, @@ -195,7 +195,7 @@ def create_fused_moe_router( return FusedTopKRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, renormalize=renormalize, scoring_func=scoring_func, enable_eplb=enable_eplb, diff --git 
a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py index f8e46371841a..4abd0a83b728 100644 --- a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py +++ b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py @@ -7,9 +7,9 @@ import torch import vllm.envs as envs -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import RoutingMethodType +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter logger = init_logger(__name__) @@ -313,14 +313,14 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, + eplb_manager: EplbManager, enable_eplb: bool = False, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py index c87070bc5acf..6ac671b3c566 100644 --- a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py +++ b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py @@ -5,11 +5,11 @@ import torch -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, get_routing_method_type, ) +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.fused_moe import ( zero_experts_compute_triton, ) @@ -32,7 +32,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, + eplb_manager: 
EplbManager, e_score_correction_bias: torch.Tensor, num_logical_experts: int, zero_expert_type: str, @@ -45,7 +45,7 @@ def __init__( super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) From 19c9fa313674f008fd104552503dd17a1a0e9be5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Mar 2026 03:14:15 +0000 Subject: [PATCH 084/191] simplify enable_eplb flag Signed-off-by: Bill Nell --- .../modular_kernel_tools/parallel_utils.py | 1 + tests/kernels/moe/test_moe_layer.py | 7 ++-- .../layers/fused_moe/eplb_manager.py | 6 +-- vllm/model_executor/layers/fused_moe/layer.py | 41 +++++++++++-------- .../layers/fused_moe/router/base_router.py | 33 ++++++++------- .../fused_moe/router/custom_routing_router.py | 4 +- .../fused_moe/router/fused_moe_router.py | 2 +- .../router/fused_topk_bias_router.py | 4 +- .../fused_moe/router/fused_topk_router.py | 4 +- .../fused_moe/router/grouped_topk_router.py | 4 +- .../layers/fused_moe/router/router_factory.py | 20 +++------ .../router/routing_simulator_router.py | 4 +- .../fused_moe/router/zero_expert_router.py | 4 +- 13 files changed, 58 insertions(+), 76 deletions(-) diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 10a226bcd977..a998d386c6f7 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -116,6 +116,7 @@ def _worker_parallel_launch( traceback.print_exc() raise finally: + torch.accelerator.synchronize() if vllm_config is not None: cleanup_dist_env_and_memory() else: diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index 654c0fb23b84..889ea9362878 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -932,12 +932,13 @@ def make_fake_moe_layer( ) -> 
Callable: activation = MoEActivation.from_str(activation) - eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) + eplb_manager: EplbManager | None = None + if enable_eplb: + eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) router = create_fused_moe_router( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -947,7 +948,7 @@ def make_fake_moe_layer( routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=0, # TODO - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. indices_type_getter=lambda: indices_type, diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index f48a0e418b62..6d43554ba7ea 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -216,21 +216,19 @@ def make_expert_params_mapping( @staticmethod def validate_configuration( - enabled: bool, global_num_experts: int, ep_size: int + global_num_experts: int, + ep_size: int, ) -> None: """ Validate EPLB configuration. 
Args: - enabled: Whether EPLB is enabled global_num_experts: Total number of experts (including redundant) ep_size: Expert parallelism size Raises: AssertionError: If configuration is invalid """ - if not enabled: - return # EPLB currently only supports even distribution of experts across ranks assert global_num_experts % ep_size == 0, ( diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c0eff6980169..c0ad296466e6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -358,7 +358,9 @@ def __init__( self.logical_num_experts = num_experts # Initialize EPLB manager (or None?) - eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) + eplb_manager: EplbManager | None = None + if enable_eplb: + eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping @@ -400,12 +402,12 @@ def __init__( # Determine expert maps if self.use_ep: - # Validate EPLB configuration - eplb_manager.validate_configuration( - enable_eplb, self.global_num_experts, self.ep_size - ) - - if not enable_eplb: + if eplb_manager is not None: + # Validate EPLB configuration + eplb_manager.validate_configuration( + self.global_num_experts, self.ep_size + ) + else: assert num_redundant_experts == 0, ( "Redundant experts are only supported with EPLB." 
) @@ -415,7 +417,7 @@ def __init__( moe_parallel_config=self.moe_parallel_config, num_expert_group=num_expert_group, num_redundant_experts=num_redundant_experts, - enable_eplb=enable_eplb, + enable_eplb=eplb_manager is not None, ) self._expert_map: torch.Tensor | None @@ -488,7 +490,6 @@ def __init__( router = create_fused_moe_router( top_k=top_k, global_num_experts=self.global_num_experts, - eplb_manager=eplb_manager, renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -500,7 +501,7 @@ def __init__( else 1.0, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=self.num_fused_shared_experts, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. indices_type_getter=lambda: self._runner.quant_method.topk_indices_dtype, @@ -582,7 +583,7 @@ def __init__( "is_act_and_mul=False is supported only for CUDA and ROCm for now" ) - if enable_eplb and not quant_method.supports_eplb: + if eplb_manager is not None and not quant_method.supports_eplb: # TODO: Add support for additional quantization methods. # The implementation for other quantization methods does not # contain essential differences, but the current quant API @@ -1375,7 +1376,10 @@ def load_weights( def get_expert_weights(self) -> Iterable[torch.Tensor]: """Delegate to EPLB manager.""" - return self._runner.router.eplb_manager.get_expert_weights(self) + if self._runner.router.eplb_manager is not None: + return self._runner.router.eplb_manager.get_expert_weights(self) + else: + return [] def set_eplb_state( self, @@ -1390,12 +1394,13 @@ def set_eplb_state( This is used later in forward pass, where we get the expert mapping and record the load metrics in `expert_load_view`. 
""" - self._runner.router.eplb_manager.set_state( - moe_layer_idx, - expert_load_view, - logical_to_physical_map, - logical_replica_count, - ) + if self._runner.router.eplb_manager is not None: + self._runner.router.eplb_manager.set_state( + moe_layer_idx, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ) def ensure_moe_quant_config_init(self): if self._runner.quant_method.moe_quant_config is None: diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index 25d7a46fcfc9..605cf2e1546b 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -109,8 +109,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, # TODO(bnell): Once the MK is constructed at layer init time, we # can make this a plain value instead of a callback. 
indices_type_getter: Callable[[], torch.dtype | None] | None = None, @@ -125,13 +124,11 @@ def __init__( self.top_k = top_k self.global_num_experts = global_num_experts self._eplb_manager = eplb_manager - self.eplb_state = eplb_manager.state - self.enable_eplb = enable_eplb self.indices_type_getter = indices_type_getter self.capture_fn: Callable[[torch.Tensor], None] | None = None @property - def eplb_manager(self) -> EplbManager: + def eplb_manager(self) -> EplbManager | None: return self._eplb_manager def set_capture_fn(self, capture_fn: Callable[[torch.Tensor], None] | None) -> None: @@ -139,15 +136,16 @@ def set_capture_fn(self, capture_fn: Callable[[torch.Tensor], None] | None) -> N self.capture_fn = capture_fn def _validate_eplb_state(self) -> None: - """Validate that EPLB state is properly initialized if EPLB is enabled.""" - if self.enable_eplb: - if self.eplb_state.expert_load_view is None: + if self.eplb_manager is not None: + eplb_state = self.eplb_manager.state + """Validate that EPLB state is properly initialized if EPLB is enabled.""" + if eplb_state.expert_load_view is None: raise ValueError("enable_eplb=True requires expert_load_view != None") - if self.eplb_state.logical_to_physical_map is None: + if eplb_state.logical_to_physical_map is None: raise ValueError( "enable_eplb=True requires logical_to_physical_map != None" ) - if self.eplb_state.logical_replica_count is None: + if eplb_state.logical_replica_count is None: raise ValueError( "enable_eplb=True requires logical_replica_count != None" ) @@ -160,15 +158,16 @@ def _get_indices_type(self) -> torch.dtype | None: def _apply_eplb_mapping(self, topk_ids: torch.Tensor) -> torch.Tensor: """Apply EPLB mapping to convert logical expert IDs to physical expert IDs.""" - if self.enable_eplb: - assert self.eplb_state.expert_load_view is not None - assert self.eplb_state.logical_to_physical_map is not None - assert self.eplb_state.logical_replica_count is not None + if self.eplb_manager is not None: + 
eplb_state = self.eplb_manager.state + assert eplb_state.expert_load_view is not None + assert eplb_state.logical_to_physical_map is not None + assert eplb_state.logical_replica_count is not None return eplb_map_to_physical_and_record( topk_ids=topk_ids, - expert_load_view=self.eplb_state.expert_load_view, - logical_to_physical_map=self.eplb_state.logical_to_physical_map, - logical_replica_count=self.eplb_state.logical_replica_count, + expert_load_view=eplb_state.expert_load_view, + logical_to_physical_map=eplb_state.logical_to_physical_map, + logical_replica_count=eplb_state.logical_replica_count, ) return topk_ids diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py index 6baa56a05b89..5cf7061fbf37 100644 --- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py +++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py @@ -16,17 +16,15 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager, custom_routing_function: Callable, + eplb_manager: EplbManager | None = None, renormalize: bool = True, - enable_eplb: bool = False, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) self.custom_routing_function = custom_routing_function diff --git a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py index 0553bf63ecfa..e8151069a4cb 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py @@ -29,7 +29,7 @@ def routing_method_type(self) -> RoutingMethodType: @property @abstractmethod - def eplb_manager(self) -> EplbManager: + def eplb_manager(self) -> EplbManager 
| None: raise NotImplementedError @abstractmethod diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index dbc2cf4f89e2..f5071996a722 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -177,19 +177,17 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager, e_score_correction_bias: torch.Tensor, scoring_func: str, renormalize: bool = True, routed_scaling_factor: float = 1.0, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) self.e_score_correction_bias = e_score_correction_bias diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index 229f15dcdaaf..dc0390a0348c 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -120,17 +120,15 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager, scoring_func: str = "softmax", renormalize: bool = True, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) self.renormalize = renormalize diff --git a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py index 
70bb99801567..a427f287c14a 100644 --- a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py @@ -254,7 +254,6 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager, num_expert_group: int, topk_group: int, renormalize: bool = True, @@ -262,14 +261,13 @@ def __init__( routed_scaling_factor: float = 1.0, e_score_correction_bias: torch.Tensor | None = None, num_fused_shared_experts: int = 0, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) self.num_expert_group = num_expert_group diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py index 32125db9310c..502891ea29d8 100644 --- a/vllm/model_executor/layers/fused_moe/router/router_factory.py +++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py @@ -48,7 +48,6 @@ def create_fused_moe_router( # custom routing parameters custom_routing_function: Callable | None = None, # eplb parameters - enable_eplb: bool = False, eplb_manager: EplbManager | None = None, # zero expert parameters zero_expert_type: str | None = None, @@ -88,7 +87,6 @@ def create_fused_moe_router( custom_routing_function: Optional custom routing function EPLB arguments: - enable_eplb: Whether EPLB is enabled eplb_manager: EPLB (Expert Parallelism Load Balancing) manager Zero expert arguments: @@ -101,15 +99,12 @@ def create_fused_moe_router( An instance of the appropriate FusedMoERouter subclass """ - assert eplb_manager is not None, "eplb_manager is required" - routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY if routing_strategy != "": return RoutingSimulatorRouter( 
top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) @@ -123,14 +118,13 @@ def create_fused_moe_router( return ZeroExpertRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, e_score_correction_bias=e_score_correction_bias, num_logical_experts=num_logical_experts, zero_expert_type=zero_expert_type, scoring_func=scoring_func, renormalize=renormalize, routed_scaling_factor=routed_scaling_factor, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) @@ -144,7 +138,6 @@ def create_fused_moe_router( grouped_topk_router = GroupedTopKRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, num_expert_group=num_expert_group, topk_group=topk_group, renormalize=renormalize, @@ -152,7 +145,7 @@ def create_fused_moe_router( routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=num_fused_shared_experts, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) if ( @@ -172,10 +165,9 @@ def create_fused_moe_router( return CustomRoutingRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, custom_routing_function=custom_routing_function, renormalize=renormalize, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) @@ -183,21 +175,19 @@ def create_fused_moe_router( return FusedTopKBiasRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, e_score_correction_bias=e_score_correction_bias, scoring_func=scoring_func, renormalize=renormalize, routed_scaling_factor=routed_scaling_factor, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) return FusedTopKRouter( top_k=top_k, global_num_experts=global_num_experts, - 
eplb_manager=eplb_manager, renormalize=renormalize, scoring_func=scoring_func, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py index 4abd0a83b728..5d5ee87da28c 100644 --- a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py +++ b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py @@ -313,15 +313,13 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py index 6ac671b3c566..e202e324d1b4 100644 --- a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py +++ b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py @@ -32,21 +32,19 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager, e_score_correction_bias: torch.Tensor, num_logical_experts: int, zero_expert_type: str, scoring_func: str = "softmax", renormalize: bool = False, routed_scaling_factor: float = 1.0, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) self.e_score_correction_bias = e_score_correction_bias From eaaceda561cdaccc1483e5363aaa3c634c4cec4b Mon Sep 17 
00:00:00 2001 From: Bill Nell Date: Thu, 26 Mar 2026 04:17:33 +0000 Subject: [PATCH 085/191] add ExpertMapManager Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 8 +- .../layers/fused_moe/expert_map_manager.py | 408 ++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 293 +++++-------- 3 files changed, 513 insertions(+), 196 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/expert_map_manager.py diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index 889ea9362878..ed4caed2eab2 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1126,8 +1126,8 @@ def _test_body_eplb( ) # Necessary? - if moe_layer._expert_map is not None: - moe_layer._expert_map = moe_layer._expert_map.to(device) + # if moe_layer._expert_map is not None: + # moe_layer._expert_map = moe_layer._expert_map.to(device) # All ranks must generate the same permutation initial_indices = torch.arange(num_experts, dtype=torch.long) @@ -1303,8 +1303,8 @@ def _run_one_config( ) # Necessary? - if moe_layer._expert_map is not None: - moe_layer._expert_map = moe_layer._expert_map.to(device) + # if moe_layer._expert_map is not None: + # moe_layer._expert_map = moe_layer._expert_map.to(device) num_tokens = m num_tokens_across_dp = torch.tensor( diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py new file mode 100644 index 000000000000..c4188db2dba3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -0,0 +1,408 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Expert Map Manager for MoE layers. + +This module contains the ExpertMapManager class which manages expert ID +mappings and placement strategies for Expert Parallelism in MoE models. 
+""" + +import torch + +from vllm.config.parallel import ExpertPlacementStrategy +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig + +logger = init_logger(__name__) + + +def determine_expert_map( + ep_size: int, + ep_rank: int, + global_num_experts: int, + expert_placement_strategy: ExpertPlacementStrategy = "linear", + num_fused_shared_experts: int = 0, + return_expert_mask: bool = False, +) -> tuple[int, torch.Tensor | None, torch.Tensor | None]: + """ + Calculates how many experts should be assigned to each rank for EP and + creates a mapping from global to local expert index. Experts are + distributed evenly across ranks. Any remaining are assigned to the + last rank. + + Args: + ep_size: The size of the expert parallel group + ep_rank: The rank of the current process in the expert parallel + group + global_num_experts: The total number of experts in the model. + expert_placement_strategy: The expert placement strategy. + num_fused_shared_experts: Number of fused shared experts (for AITER) + return_expert_mask: Whether to return expert mask for AITER + + Returns: + tuple[int, Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple containing: + - local_num_experts (int): The number of experts assigned + to the current rank. + - expert_map (Optional[torch.Tensor]): A tensor of shape + (global_num_experts,) mapping from global to local index. + Contains -1 for experts not assigned to the current rank. + Returns None if ep_size is 1. + - expert_mask (Optional[torch.Tensor]): A tensor of shape + (global_num_experts + num_fused_shared_experts + 1,) + containing 1 for experts assigned to the current rank + and 0 for sentinel. + Returns None if ep_size is 1. + Used only when AITER MOE is enabled. + """ + from typing import get_args + + assert ep_size > 0 + if ep_size == 1: + return (global_num_experts, None, None) + + # Distribute experts as evenly as possible to each rank. 
+ base_experts = global_num_experts // ep_size + remainder = global_num_experts % ep_size + local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts + + # Create a tensor of size num_experts filled with -1 + expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32) + # Create an expert map for the local experts + if expert_placement_strategy == "linear": + start_idx = ep_rank * base_experts + min(ep_rank, remainder) + expert_map[start_idx : start_idx + local_num_experts] = torch.arange( + 0, local_num_experts, dtype=torch.int32 + ) + elif expert_placement_strategy == "round_robin": + local_log_experts = torch.arange( + ep_rank, global_num_experts, ep_size, dtype=torch.int32 + ) + + expert_map[local_log_experts] = torch.arange( + 0, local_num_experts, dtype=torch.int32 + ) + else: + raise ValueError( + "Unsupported expert placement strategy " + f"'{expert_placement_strategy}', expected one of " + f"{get_args(ExpertPlacementStrategy)}" + ) + + expert_mask = None + if return_expert_mask: + expert_mask = torch.ones( + (global_num_experts + num_fused_shared_experts + 1,), dtype=torch.int32 + ) + expert_mask[-1] = 0 + expert_mask[:global_num_experts] = expert_map > -1 + expert_map = torch.cat( + ( + expert_map, + torch.tensor( + [local_num_experts + i for i in range(num_fused_shared_experts)], + dtype=torch.int32, + ), + ), + dim=0, + ) + + return (local_num_experts, expert_map, expert_mask) + + +class ExpertMapManager: + """ + Manages expert ID mappings and placement for Expert Parallelism. 
+ + Responsibilities: + - Calculate local vs global expert counts + - Map between global, local, and physical expert IDs + - Manage placement strategies (linear, round_robin) + - Maintain routing tables for round-robin placement + - Support dynamic reconfiguration of EP topology + """ + + def __init__( + self, + global_num_experts: int, + logical_num_experts: int, + moe_parallel_config: FusedMoEParallelConfig, + placement_strategy: ExpertPlacementStrategy, + num_fused_shared_experts: int = 0, + rocm_aiter_enabled: bool = False, + device: torch.device | None = None, + ): + """ + Initialize expert map manager. + + Args: + global_num_experts: Total number of experts across all ranks + logical_num_experts: Number of logical (non-redundant) experts + moe_parallel_config: MoE parallel configuration (contains ep_size, + ep_rank, backend flags) + placement_strategy: Strategy for placing experts ('linear' or 'round_robin') + num_fused_shared_experts: Number of fused shared experts (for AITER) + rocm_aiter_enabled: Whether ROCm AITER fusion is enabled + device: Device for tensor allocations + """ + self.global_num_experts = global_num_experts + self.logical_num_experts = logical_num_experts + self.moe_parallel_config = moe_parallel_config + self.num_fused_shared_experts = num_fused_shared_experts + self.rocm_aiter_enabled = rocm_aiter_enabled + self.device = device + + # Determine effective placement strategy + self._placement_strategy = self._determine_placement_strategy( + placement_strategy + ) + + # Calculate expert mappings + self._calculate_expert_maps() + + # Initialize routing tables if needed + self._maybe_init_routing_tables() + + @property + def ep_size(self) -> int: + """Expert parallelism world size.""" + return self.moe_parallel_config.ep_size + + @property + def ep_rank(self) -> int: + """Expert parallelism rank.""" + return self.moe_parallel_config.ep_rank + + @property + def local_num_experts(self) -> int: + """Number of experts assigned to this rank.""" + 
return self._local_num_experts + + @property + def expert_map(self) -> torch.Tensor | None: + """ + Mapping from global expert ID to local expert ID. + + Returns tensor of shape (global_num_experts,) where: + - expert_map[global_id] = local_id if expert is on this rank + - expert_map[global_id] = -1 if expert is not on this rank + + Returns None if EP is not enabled (ep_size == 1). + """ + return self._expert_map + + @property + def expert_mask(self) -> torch.Tensor | None: + """ + Expert mask for AITER fusion (ROCm-specific). + + Returns tensor of shape (global_num_experts + num_fused_shared + 1,) + where 1 indicates expert is on this rank, 0 otherwise. + """ + return self._expert_mask + + @property + def placement_strategy(self) -> ExpertPlacementStrategy: + """Expert placement strategy ('linear' or 'round_robin').""" + return self._placement_strategy + + @property + def routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + """ + Routing tables for round-robin placement. + + Returns (global_to_physical, physical_to_global, local_to_global) + or None if not using round-robin or tables not needed. + """ + if not hasattr(self, "_routing_tables"): + return None + return self._routing_tables + + def map_global_to_local(self, global_id: int) -> int: + """ + Map global expert ID to local expert ID. 
+ + Args: + global_id: Global expert ID (0 to global_num_experts - 1) + + Returns: + Local expert ID (0 to local_num_experts - 1) + + Raises: + ValueError: If expert is not on this rank + """ + if self._expert_map is None: + return global_id + + local_id = self._expert_map[global_id].item() + if local_id == -1: + raise ValueError( + f"Expert {global_id} is not assigned to rank {self.ep_rank}" + ) + return local_id + + def is_local_expert(self, global_id: int) -> bool: + """Check if expert is assigned to this rank.""" + if self._expert_map is None: + return True + return self._expert_map[global_id] != -1 + + def get_local_expert_ids(self) -> list[int]: + """Get list of global IDs for experts on this rank.""" + if self._expert_map is None: + return list(range(self.global_num_experts)) + + return torch.where(self._expert_map != -1)[0].tolist() + + def update( + self, + new_ep_size: int | None = None, + new_ep_rank: int | None = None, + ) -> None: + """ + Update expert mappings for new EP configuration. + + Used during dynamic reconfiguration (e.g., elastic scaling). + + Args: + new_ep_size: New EP world size (if changed) + new_ep_rank: New EP rank (if changed) + """ + if new_ep_size is not None: + self.moe_parallel_config.ep_size = new_ep_size + if new_ep_rank is not None: + self.moe_parallel_config.ep_rank = new_ep_rank + + # Recalculate everything + self._placement_strategy = self._determine_placement_strategy( + self._placement_strategy + ) + self._calculate_expert_maps() + self._maybe_init_routing_tables() + + def get_compressed_map_string(self) -> str: + """ + Get compressed string representation of expert map for logging. + + Returns string mapping local to global expert IDs. 
+ """ + if self._expert_map is None: + return f"[0..{self.global_num_experts - 1}]" + + global_indices = torch.where(self._expert_map != -1)[0] + local_indices = self._expert_map[global_indices] + return ", ".join( + f"{local_index.item()}->{global_index.item()}" + for local_index, global_index in zip(local_indices, global_indices) + ) + + # Private methods + + def _determine_placement_strategy( + self, requested_strategy: ExpertPlacementStrategy + ) -> ExpertPlacementStrategy: + """Determine effective placement strategy based on config.""" + if requested_strategy != "round_robin": + return requested_strategy + + # Round-robin requires specific conditions + if self.ep_size == 1: + return "linear" + + if ( + self.moe_parallel_config.use_all2all_kernels + and not self.moe_parallel_config.use_deepep_ll_kernels + and not self.moe_parallel_config.use_nixl_ep_kernels + ): + logger.warning( + "Round-robin placement requires DeepEP-ll or NIXL backend. " + "Falling back to linear." + ) + return "linear" + + return "round_robin" + + def _calculate_expert_maps(self) -> None: + """Calculate expert mappings based on placement strategy.""" + if self.ep_size == 1: + # No EP, all experts are local + self._local_num_experts = self.global_num_experts + self._expert_map = None + self._expert_mask = None + return + + # Call determine_expert_map with current config + ( + self._local_num_experts, + self._expert_map, + self._expert_mask, + ) = determine_expert_map( + ep_size=self.ep_size, + ep_rank=self.ep_rank, + global_num_experts=self.global_num_experts, + expert_placement_strategy=self._placement_strategy, + num_fused_shared_experts=self.num_fused_shared_experts, + return_expert_mask=self.rocm_aiter_enabled, + ) + + # Move to device if specified + if self.device is not None: + if self._expert_map is not None: + self._expert_map = self._expert_map.to(self.device) + if self._expert_mask is not None: + self._expert_mask = self._expert_mask.to(self.device) + + def 
_maybe_init_routing_tables(self) -> None: + """Initialize routing tables if needed for round-robin.""" + if self._placement_strategy != "round_robin": + return + + if ( + not self.moe_parallel_config.use_deepep_ll_kernels + and not self.moe_parallel_config.use_nixl_ep_kernels + ): + return + + if self._expert_map is None: + return + + self._routing_tables = self._build_routing_tables() + + def _build_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Build routing tables for round-robin placement.""" + device_kwargs = {"device": self.device} if self.device is not None else {} + + global_indices = torch.arange( + self.global_num_experts, dtype=torch.long, **device_kwargs + ) + owner = torch.remainder(global_indices, self.ep_size) + local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") + + base = self.global_num_experts // self.ep_size + remainder = self.global_num_experts % self.ep_size + physical_offset = owner * base + + if remainder > 0: + remainder_tensor = torch.tensor( + remainder, dtype=torch.long, **device_kwargs + ) + physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) + + global_to_physical = physical_offset + local_index + physical_to_global = torch.empty_like(global_to_physical) + physical_to_global[global_to_physical] = global_indices + + local_global = torch.arange( + self.ep_rank, + self.global_num_experts, + self.ep_size, + dtype=torch.long, + **device_kwargs, + ) + if local_global.numel() != self._local_num_experts: + local_global = local_global[: self._local_num_experts] + + return (global_to_physical, physical_to_global, local_global) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c0ad296466e6..88da92271967 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -3,7 +3,7 @@ from collections.abc import Callable, Iterable from enum import Enum 
-from typing import Literal, cast, get_args, overload +from typing import Literal, overload import torch from torch.nn.parameter import UninitializedParameter @@ -26,6 +26,9 @@ RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager +from vllm.model_executor.layers.fused_moe.expert_map_manager import ( + ExpertMapManager, +) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) @@ -68,95 +71,7 @@ class FusedMoeWeightScaleSupported(Enum): BLOCK = "block" -def determine_expert_map( - ep_size: int, - ep_rank: int, - global_num_experts: int, - expert_placement_strategy: ExpertPlacementStrategy = "linear", - num_fused_shared_experts: int = 0, - return_expert_mask: bool = False, -) -> tuple[int, torch.Tensor | None, torch.Tensor | None]: - """ - Calculates how many experts should be assigned to each rank for EP and - creates a mapping from global to local expert index. Experts are - distributed evenly across ranks. Any remaining are assigned to the - last rank. - - Args: - ep_size: The size of the expert parallel group - ep_rank: The rank of the current process in the expert parallel - group - global_num_experts: The total number of experts in the model. - expert_placement_strategy: The expert placement strategy. - - Returns: - tuple[int, Optional[torch.Tensor]]: A tuple containing: - - local_num_experts (int): The number of experts assigned - to the current rank. - - expert_map (Optional[torch.Tensor]): A tensor of shape - (global_num_experts,) mapping from global to local index. - Contains -1 for experts not assigned to the current rank. - Returns None if ep_size is 1. - - expert_mask (Optional[torch.Tensor]): A tensor of shape - (global_num_experts + num_fused_shared_experts + 1,) - containing 1 for experts assigned to the current rank - and 0 for sentinel. - Returns None if ep_size is 1. - Used only when AITER MOE is enabled. 
- """ - assert ep_size > 0 - if ep_size == 1: - return (global_num_experts, None, None) - - # Distribute experts as evenly as possible to each rank. - base_experts = global_num_experts // ep_size - remainder = global_num_experts % ep_size - local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts - - # Create a tensor of size num_experts filled with -1 - expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32) - # Create an expert map for the local experts - if expert_placement_strategy == "linear": - start_idx = ep_rank * base_experts + min(ep_rank, remainder) - expert_map[start_idx : start_idx + local_num_experts] = torch.arange( - 0, local_num_experts, dtype=torch.int32 - ) - elif expert_placement_strategy == "round_robin": - local_log_experts = torch.arange( - ep_rank, global_num_experts, ep_size, dtype=torch.int32 - ) - - expert_map[local_log_experts] = torch.arange( - 0, local_num_experts, dtype=torch.int32 - ) - else: - raise ValueError( - "Unsupported expert placement strategy " - f"'{expert_placement_strategy}', expected one of " - f"{get_args(ExpertPlacementStrategy)}" - ) - - expert_mask = None - if return_expert_mask: - expert_mask = torch.ones( - (global_num_experts + num_fused_shared_experts + 1,), dtype=torch.int32 - ) - expert_mask[-1] = 0 - expert_mask[:global_num_experts] = expert_map > -1 - expert_map = torch.cat( - ( - expert_map, - torch.tensor( - [local_num_experts + i for i in range(num_fused_shared_experts)], - dtype=torch.int32, - ), - ), - dim=0, - ) - - return (local_num_experts, expert_map, expert_mask) - - +# Should be method def determine_expert_placement_strategy( expert_placement_strategy: ExpertPlacementStrategy, moe_parallel_config: FusedMoEParallelConfig, @@ -194,27 +109,6 @@ def determine_expert_placement_strategy( return expert_placement_strategy -def get_compressed_expert_map(expert_map: torch.Tensor) -> str: - """ - Compresses the expert map by removing any -1 entries. 
- - Args: - expert_map (torch.Tensor): A tensor of shape (global_num_experts,) - mapping from global to local index. Contains -1 for experts not - assigned to the current rank. - - Returns: - str: A string mapping from local to global index. - Using str to support hashing for logging once only. - """ - global_indices = torch.where(expert_map != -1)[0] - local_indices = expert_map[global_indices] - return ", ".join( - f"{local_index.item()}->{global_index.item()}" - for local_index, global_index in zip(local_indices, global_indices) - ) - - # TODO(rob): move this down to the kernel. def maybe_roundup_hidden_size( hidden_size: int, @@ -372,7 +266,7 @@ def __init__( compilation_config.static_forward_context[prefix] = self compilation_config.static_all_moe_layers.append(prefix) self.layer_name = prefix - self.expert_placement_strategy: ExpertPlacementStrategy = ( + expert_placement_strategy: ExpertPlacementStrategy = ( vllm_config.parallel_config.expert_placement_strategy ) @@ -412,27 +306,37 @@ def __init__( "Redundant experts are only supported with EPLB." 
) - self.expert_placement_strategy = determine_expert_placement_strategy( - expert_placement_strategy=self.expert_placement_strategy, + # Determine expert placement strategy before creating manager + expert_placement_strategy_effective = determine_expert_placement_strategy( + expert_placement_strategy=expert_placement_strategy, moe_parallel_config=self.moe_parallel_config, num_expert_group=num_expert_group, num_redundant_experts=num_redundant_experts, enable_eplb=eplb_manager is not None, ) + else: + expert_placement_strategy_effective = expert_placement_strategy - self._expert_map: torch.Tensor | None - local_num_experts, expert_map, expert_mask = determine_expert_map( - ep_size=self.ep_size, - ep_rank=self.ep_rank, - global_num_experts=self.global_num_experts, - expert_placement_strategy=self.expert_placement_strategy, - num_fused_shared_experts=self.num_fused_shared_experts, - return_expert_mask=self.rocm_aiter_fmoe_enabled, - ) - self.local_num_experts = local_num_experts - self.register_buffer("_expert_map", expert_map) - self.register_buffer("expert_mask", expert_mask) - self._maybe_init_expert_routing_tables() + # Create expert map manager + self.expert_map_manager = ExpertMapManager( + global_num_experts=self.global_num_experts, + logical_num_experts=self.logical_num_experts, + moe_parallel_config=self.moe_parallel_config, + placement_strategy=expert_placement_strategy_effective, + num_fused_shared_experts=self.num_fused_shared_experts, + rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, + device=vllm_config.device_config.device, + ) + + # Register buffers for state_dict compatibility + if self.expert_map_manager.expert_map is not None: + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + + if self.expert_map_manager.expert_mask is not None: + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + + # Log EP configuration (move into EMM?) 
+ if self.use_ep: logger.info_once( "[EP Rank %s/%s] Expert parallelism is enabled. Expert " "placement strategy: %s. Local/global" @@ -440,25 +344,22 @@ def __init__( " %s.", self.ep_rank, self.ep_size, - self.expert_placement_strategy, - self.local_num_experts, - self.global_num_experts, - get_compressed_expert_map(self._expert_map), - ) - else: - self.local_num_experts, self._expert_map, self.expert_mask = ( - self.global_num_experts, - None, - None, + self.expert_map_manager.placement_strategy, + self.expert_map_manager.local_num_experts, + self.expert_map_manager.global_num_experts, + self.expert_map_manager.get_compressed_map_string(), ) self.top_k = top_k + # move into EMM? self._init_aiter_shared_experts_topK_buffer( vllm_config=vllm_config, dp_size=dp_size_ ) + # XXXXX move into EMM if self.use_ep and self.rocm_aiter_fmoe_enabled: - assert self.expert_mask is None or torch.all( + expert_mask = self.expert_map_manager.expert_mask + assert expert_mask is None or torch.all( (expert_mask == 0) | (expert_mask == 1) ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." 
@@ -723,6 +624,35 @@ def ep_rank(self): def use_ep(self): return self.moe_parallel_config.use_ep + # XXXXXXXXX keep this separate + @property + def local_num_experts(self) -> int: + """Number of experts assigned to this rank.""" + return self.expert_map_manager.local_num_experts + + @property + def expert_placement_strategy(self) -> ExpertPlacementStrategy: + """Expert placement strategy ('linear' or 'round_robin').""" + return self.expert_map_manager.placement_strategy + + @property + def expert_global_to_physical(self) -> torch.Tensor | None: + """Routing table: global expert ID to physical expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[0] if tables else None + + @property + def expert_physical_to_global(self) -> torch.Tensor | None: + """Routing table: physical expert ID to global expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[1] if tables else None + + @property + def expert_local_to_global(self) -> torch.Tensor | None: + """Routing table: local expert ID to global expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[2] if tables else None + @property def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass @@ -739,39 +669,19 @@ def shared_experts(self) -> SharedExperts | None: def _maybe_init_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - # Currently routing_tables only needed for round-robin expert placement - # with DeepEP-ll or NIXL EP all2all backends. 
- if self.expert_placement_strategy != "round_robin" or ( - not self.moe_parallel_config.use_deepep_ll_kernels - and not self.moe_parallel_config.use_nixl_ep_kernels - ): - return None + """Get routing tables (already initialized by manager).""" + # Return routing tables from manager + routing_tables = self.expert_map_manager.routing_tables - if hasattr(self, "expert_global_to_physical"): - return cast( - tuple[torch.Tensor, torch.Tensor, torch.Tensor], - ( - self.expert_global_to_physical, - self.expert_physical_to_global, - self.expert_local_to_global, - ), - ) - - if self._expert_map is None: + if routing_tables is None: return None - routing_tables = self.ensure_round_robin_expert_routing_tables( - global_num_experts=self.global_num_experts, - ep_size=self.ep_size, - ep_rank=self.ep_rank, - local_num_experts=self.local_num_experts, - device=self._expert_map.device, - ) - - global_to_physical, physical_to_global, local_global = routing_tables - self.register_buffer("expert_global_to_physical", global_to_physical) - self.register_buffer("expert_physical_to_global", physical_to_global) - self.register_buffer("expert_local_to_global", local_global) + # Register buffers for backward compatibility if not already registered + if not hasattr(self, "expert_global_to_physical"): + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) return routing_tables @@ -815,26 +725,23 @@ def ensure_round_robin_expert_routing_tables( return (global_to_physical, physical_to_global, local_global) def update_expert_map(self): - # ep_size and ep_rank should already be updated - assert self._expert_map is not None - with self._expert_map.device: - local_num_experts, expert_map, expert_mask = determine_expert_map( - ep_size=self.ep_size, - ep_rank=self.ep_rank, - 
global_num_experts=self.global_num_experts, - expert_placement_strategy=self.expert_placement_strategy, - num_fused_shared_experts=self.num_fused_shared_experts, - return_expert_mask=self.rocm_aiter_fmoe_enabled, + """Update expert mappings for new EP configuration.""" + # ep_size and ep_rank should already be updated in moe_parallel_config + self.expert_map_manager.update() + + # Re-register buffers for state_dict compatibility + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + + # Update routing table buffers if needed + self._maybe_init_expert_routing_tables() + + # Handle AITER shared experts if needed + if self.aiter_fmoe_shared_expert_enabled: + self._init_aiter_shared_experts_topK_buffer( + vllm_config=get_current_vllm_config(), + dp_size=get_dp_group().world_size, ) - self.local_num_experts = local_num_experts - self.register_buffer("_expert_map", expert_map) - self.register_buffer("expert_mask", expert_mask) - self._maybe_init_expert_routing_tables() - if self.aiter_fmoe_shared_expert_enabled: - self._init_aiter_shared_experts_topK_buffer( - vllm_config=get_current_vllm_config(), - dp_size=get_dp_group().world_size, - ) def _load_per_tensor_weight_scale( self, @@ -1008,9 +915,8 @@ def _load_g_idx( expert_data.copy_(loaded_weight) def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - if self._expert_map is None: - return expert_id - return self._expert_map[expert_id].item() + """Map global expert ID to local expert ID.""" + return self.expert_map_manager.map_global_to_local(expert_id) def _init_aiter_shared_experts_topK_buffer( self, vllm_config: VllmConfig, dp_size: int @@ -1027,7 +933,8 @@ def _init_aiter_shared_experts_topK_buffer( * dp_size, is_EP=self.use_ep, ) - self.local_num_experts += self.num_fused_shared_experts + # HACK + self.expert_map_manager._local_num_experts += self.num_fused_shared_experts @overload def weight_loader( 
@@ -1428,7 +1335,9 @@ def forward_native( @property def expert_map(self) -> torch.Tensor | None: return ( - self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask + self.expert_map_manager.expert_map + if not self.rocm_aiter_fmoe_enabled + else self.expert_map_manager.expert_mask ) def forward_cuda( From a8c52807f71cb25a98e0dc261548182f3041d625 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Mar 2026 16:13:24 +0000 Subject: [PATCH 086/191] comment Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 88da92271967..e00de654d143 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -71,7 +71,7 @@ class FusedMoeWeightScaleSupported(Enum): BLOCK = "block" -# Should be method +# Should be method? only used in layer def determine_expert_placement_strategy( expert_placement_strategy: ExpertPlacementStrategy, moe_parallel_config: FusedMoEParallelConfig, From 7164218c9fbb9ab1b97c75056c1589d27e36faea Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Mar 2026 17:23:40 +0000 Subject: [PATCH 087/191] reshuffle code Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 285 +++++++++--------- .../fused_moe/runner/moe_runner_base.py | 2 +- 3 files changed, 141 insertions(+), 150 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index c4188db2dba3..54ca501a125d 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -367,9 +367,9 @@ def _maybe_init_routing_tables(self) -> None: if self._expert_map is None: return - self._routing_tables = 
self._build_routing_tables() + self._routing_tables = self._ensure_round_robin_expert_routing_tables() - def _build_routing_tables( + def _ensure_round_robin_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Build routing tables for round-robin placement.""" diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index e00de654d143..607791034bbf 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -145,6 +145,11 @@ def maybe_roundup_hidden_size( return hidden_size +class RoutedExperts(torch.nn.Module): + def __init__(self): + super().__init__() + + # --8<-- [start:fused_moe] @CustomOp.register("fused_moe") class FusedMoE(CustomOp): @@ -411,6 +416,7 @@ def __init__( ) self.routing_method_type: RoutingMethodType = router.routing_method_type + # TODO(bnell): is this redundant now? # When using zero experts, slice e_score_correction_bias to cover # only real experts, for compatibility with monolithic kernels that # read it directly. @@ -457,6 +463,8 @@ def __init__( # TODO: in_dtype == out_dtype? disable_inplace=disable_inplace() or shared_experts is not None, ) + + # Move XXXXXXXXXXXXX if self.moe_config.use_mori_kernels: assert self.rocm_aiter_fmoe_enabled, ( "Mori needs to be used with aiter fused_moe for now." @@ -479,11 +487,13 @@ def __init__( # TODO(bnell): only for weight loading. how to get around this? self.quant_method = quant_method + # Move XXXXXXXXXXXXX if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): raise NotImplementedError( "is_act_and_mul=False is supported only for CUDA and ROCm for now" ) + # Move XXXXXXXXXXXXX if eplb_manager is not None and not quant_method.supports_eplb: # TODO: Add support for additional quantization methods. 
# The implementation for other quantization methods does not @@ -505,12 +515,9 @@ def __init__( "weight_loader": self.weight_loader, "global_num_experts": self.global_num_experts, } + # need full intermediate size pre-sharding for WNA16 act order - if quant_method.__class__.__name__ in ( - "GPTQMarlinMoEMethod", - "CompressedTensorsWNA16MarlinMoEMethod", - "CompressedTensorsWNA16MoEMethod", - ): + if self._needs_intermediate_size_param(quant_method): moe_quant_params["intermediate_size_full"] = intermediate_size quant_method.create_weights(layer=self, **moe_quant_params) @@ -532,6 +539,26 @@ def __init__( routed_scaling_factor=routed_scaling_factor, ) + # TODO(bnell): make this a method on quant_method + def _needs_intermediate_size_param(self, quant_method: FusedMoEMethodBase) -> bool: + return quant_method.__class__.__name__ in ( + "GPTQMarlinMoEMethod", + "CompressedTensorsWNA16MarlinMoEMethod", + "CompressedTensorsWNA16MoEMethod", + ) + + def extra_repr(self) -> str: + s = ( + f"global_num_experts={self.global_num_experts}, " + f"local_num_experts={self.local_num_experts}, " + f"top_k={self.top_k}, " + f"intermediate_size_per_partition={self.intermediate_size_per_partition}, " # noqa: E501 + f"tp_size={self.tp_size},\n" + f"ep_size={self.ep_size}, " + ) + + return s + def _get_quant_method( self, prefix: str, @@ -557,6 +584,14 @@ def _get_quant_method( def _replace_quant_method(self, mk: FusedMoEMethodBase): self._runner._replace_quant_method(mk) + def _ensure_moe_quant_config_init(self): + if self._runner.quant_method.moe_quant_config is None: + # Note: the moe_quant_config can't be constructed until after + # weight loading post processing. + self._runner.quant_method.moe_quant_config = ( + self._runner.quant_method.get_fused_moe_quant_config(self) + ) + # Note: maybe_init_modular_kernel should only be called by # prepare_communication_buffer_for_model. 
# This is called after all weight loading and post-processing, so it @@ -570,7 +605,7 @@ def maybe_init_modular_kernel(self) -> None: ): return None - self.ensure_moe_quant_config_init() + self._ensure_moe_quant_config_init() # routing_tables only needed for round-robin expert placement with # DeepEP all2all backend. routing_tables = self._maybe_init_expert_routing_tables() @@ -597,6 +632,10 @@ def maybe_init_modular_kernel(self) -> None: ) ) + # + # Properties + # + @property def layer_id(self): # Delayed import to avoid circular dependency @@ -666,6 +705,18 @@ def is_monolithic(self) -> bool: def shared_experts(self) -> SharedExperts | None: return self._runner.shared_experts + # + # Expert maps + # + + @property + def expert_map(self) -> torch.Tensor | None: + return ( + self.expert_map_manager.expert_map + if not self.rocm_aiter_fmoe_enabled + else self.expert_map_manager.expert_mask + ) + def _maybe_init_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: @@ -685,45 +736,6 @@ def _maybe_init_expert_routing_tables( return routing_tables - @staticmethod - def ensure_round_robin_expert_routing_tables( - global_num_experts: int, - ep_size: int, - ep_rank: int, - local_num_experts: int, - device: torch.device | None = None, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - device_kwargs = {"device": device} if device is not None else {} - global_indices = torch.arange( - global_num_experts, dtype=torch.long, **device_kwargs - ) - owner = torch.remainder(global_indices, ep_size) - local_index = torch.div(global_indices, ep_size, rounding_mode="floor") - base = global_num_experts // ep_size - remainder = global_num_experts % ep_size - physical_offset = owner * base - if remainder > 0: - remainder_tensor = torch.tensor( - remainder, dtype=torch.long, **device_kwargs - ) - physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) - - global_to_physical = physical_offset + local_index - physical_to_global = 
torch.empty_like(global_to_physical) - physical_to_global[global_to_physical] = global_indices - - local_global = torch.arange( - ep_rank, - global_num_experts, - ep_size, - dtype=torch.long, - **device_kwargs, - ) - if local_global.numel() != local_num_experts: - local_global = local_global[:local_num_experts] - - return (global_to_physical, physical_to_global, local_global) - def update_expert_map(self): """Update expert mappings for new EP configuration.""" # ep_size and ep_rank should already be updated in moe_parallel_config @@ -743,6 +755,84 @@ def update_expert_map(self): dp_size=get_dp_group().world_size, ) + def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: + """Map global expert ID to local expert ID.""" + return self.expert_map_manager.map_global_to_local(expert_id) + + # + # EPLB + # + + def _init_aiter_shared_experts_topK_buffer( + self, vllm_config: VllmConfig, dp_size: int + ): + if self.num_fused_shared_experts > 0: + init_aiter_topK_meta_data( + n_routed_experts=self.global_num_experts, + n_shared_experts=self.num_fused_shared_experts, + top_k=self.top_k, + tp_rank=self.ep_rank if self.use_ep else self.tp_rank, + tp_size=self.ep_size if self.use_ep else self.tp_size, + shared_experts_score=1.0, + max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens + * dp_size, + is_EP=self.use_ep, + ) + # HACK + self.expert_map_manager._local_num_experts += self.num_fused_shared_experts + + def get_expert_weights(self) -> Iterable[torch.Tensor]: + """Delegate to EPLB manager.""" + if self._runner.router.eplb_manager is not None: + return self._runner.router.eplb_manager.get_expert_weights(self) + else: + return [] + + def set_eplb_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + """ + Register the EPLB state in this layer. 
+ + This is used later in forward pass, where we get the expert mapping + and record the load metrics in `expert_load_view`. + """ + if self._runner.router.eplb_manager is not None: + self._runner.router.eplb_manager.set_state( + moe_layer_idx, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ) + + @classmethod + def make_expert_params_mapping( + cls, + model: torch.nn.Module, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + num_redundant_experts: int = 0, + ) -> list[tuple[str, str, int, str]]: + """Delegate to EPLB manager.""" + return EplbManager.make_expert_params_mapping( + model, + ckpt_gate_proj_name, + ckpt_down_proj_name, + ckpt_up_proj_name, + num_experts, + num_redundant_experts, + ) + + # + # Weight Loading + # + def _load_per_tensor_weight_scale( self, shard_id: str, @@ -914,28 +1004,6 @@ def _load_g_idx( assert shard_id in ("w1", "w3") expert_data.copy_(loaded_weight) - def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - """Map global expert ID to local expert ID.""" - return self.expert_map_manager.map_global_to_local(expert_id) - - def _init_aiter_shared_experts_topK_buffer( - self, vllm_config: VllmConfig, dp_size: int - ): - if self.num_fused_shared_experts > 0: - init_aiter_topK_meta_data( - n_routed_experts=self.global_num_experts, - n_shared_experts=self.num_fused_shared_experts, - top_k=self.top_k, - tp_rank=self.ep_rank if self.use_ep else self.tp_rank, - tp_size=self.ep_size if self.use_ep else self.tp_size, - shared_experts_score=1.0, - max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens - * dp_size, - is_EP=self.use_ep, - ) - # HACK - self.expert_map_manager._local_num_experts += self.num_fused_shared_experts - @overload def weight_loader( self, @@ -1281,46 +1349,9 @@ def load_weights( ) yield param_name - def get_expert_weights(self) -> Iterable[torch.Tensor]: - """Delegate to EPLB manager.""" - if 
self._runner.router.eplb_manager is not None: - return self._runner.router.eplb_manager.get_expert_weights(self) - else: - return [] - - def set_eplb_state( - self, - moe_layer_idx: int, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - """ - Register the EPLB state in this layer. - - This is used later in forward pass, where we get the expert mapping - and record the load metrics in `expert_load_view`. - """ - if self._runner.router.eplb_manager is not None: - self._runner.router.eplb_manager.set_state( - moe_layer_idx, - expert_load_view, - logical_to_physical_map, - logical_replica_count, - ) - - def ensure_moe_quant_config_init(self): - if self._runner.quant_method.moe_quant_config is None: - # Note: the moe_quant_config can't be constructed until after - # weight loading post processing. - self._runner.quant_method.moe_quant_config = ( - self._runner.quant_method.get_fused_moe_quant_config(self) - ) - - # @property - # def moe_quant_config(self) -> FusedMoEQuantConfig | None: - # self.ensure_moe_quant_config_init() - # return self._runner.quant_method.moe_quant_config + # + # Execution + # def forward_native( self, @@ -1332,14 +1363,6 @@ def forward_native( router_logits, ) - @property - def expert_map(self) -> torch.Tensor | None: - return ( - self.expert_map_manager.expert_map - if not self.rocm_aiter_fmoe_enabled - else self.expert_map_manager.expert_mask - ) - def forward_cuda( self, hidden_states: torch.Tensor, @@ -1347,38 +1370,6 @@ def forward_cuda( ) -> torch.Tensor: return self.forward_native(hidden_states, router_logits) - @classmethod - def make_expert_params_mapping( - cls, - model: torch.nn.Module, - ckpt_gate_proj_name: str, - ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int, - num_redundant_experts: int = 0, - ) -> list[tuple[str, str, int, str]]: - """Delegate to EPLB manager.""" - return EplbManager.make_expert_params_mapping( - model, - 
ckpt_gate_proj_name, - ckpt_down_proj_name, - ckpt_up_proj_name, - num_experts, - num_redundant_experts, - ) - - def extra_repr(self) -> str: - s = ( - f"global_num_experts={self.global_num_experts}, " - f"local_num_experts={self.local_num_experts}, " - f"top_k={self.top_k}, " - f"intermediate_size_per_partition={self.intermediate_size_per_partition}, " # noqa: E501 - f"tp_size={self.tp_size},\n" - f"ep_size={self.ep_size}, " - ) - - return s - # Mark the FusedMoE weight_loader as supporting MoE-specific parameters # to avoid expensive runtime reflection in model loading code diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index b1bf3838418f..c728a81375a6 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -655,7 +655,7 @@ def _forward_dispatch( the sequence-parallel context. """ # TODO(bnell): this can be removed after MK migration is complete. 
- layer.ensure_moe_quant_config_init() + layer._ensure_moe_quant_config_init() with self._sequence_parallel_context(): return self._forward_impl( From 2c076245f85f7d7c118b5dde9d42be7166b4dd90 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Mar 2026 20:10:09 +0000 Subject: [PATCH 088/191] make RoutedExperts class Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 4 +- vllm/lora/layers/fused_moe.py | 4 +- .../layers/fused_moe/__init__.py | 2 + .../layers/fused_moe/fused_moe_method_base.py | 28 +- vllm/model_executor/layers/fused_moe/layer.py | 2298 +++++++++-------- .../fused_moe/runner/moe_runner_base.py | 12 +- .../fused_moe/unquantized_fused_moe_method.py | 10 +- .../model_executor/layers/quantization/awq.py | 2 +- .../layers/quantization/awq_marlin.py | 3 +- .../layers/quantization/bitsandbytes.py | 3 +- .../compressed_tensors/compressed_tensors.py | 2 +- .../compressed_tensors_moe.py | 42 +- .../layers/quantization/experts_int8.py | 3 +- .../model_executor/layers/quantization/fp8.py | 7 +- .../layers/quantization/gguf.py | 3 +- .../layers/quantization/gptq.py | 2 +- .../layers/quantization/gptq_marlin.py | 3 +- .../layers/quantization/modelopt.py | 17 +- .../layers/quantization/moe_wna16.py | 3 +- .../layers/quantization/mxfp4.py | 7 +- .../layers/quantization/mxfp8.py | 1 + .../layers/quantization/quark/quark.py | 2 +- .../layers/quantization/quark/quark_moe.py | 12 +- .../quantization/utils/flashinfer_fp4_moe.py | 4 +- .../model_executor/warmup/deep_gemm_warmup.py | 5 +- vllm/v1/worker/gpu_model_runner.py | 9 +- 26 files changed, 1363 insertions(+), 1125 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index ed4caed2eab2..13b023c9c855 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -884,11 +884,11 @@ def make_fused_moe_layer( ("w2_input_scale", qw.w2_input_scale), ]: if value is not None: - layer.register_parameter( + 
layer.routed_experts.register_parameter( name, torch.nn.Parameter(value, requires_grad=False) ) - layer.quant_method.process_weights_after_loading(layer) + layer.quant_method.process_weights_after_loading(layer.routed_experts) # Temporary hack until #36286 or #36732 lands if quantization is None: diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index c7b2ac198bb8..19780c522469 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -109,8 +109,8 @@ def _get_lora_moe_configs( else: # fall back to the default config get_config_func = functools.partial( try_get_optimal_moe_lora_config, - w1_shape=layer.w13_weight.size(), - w2_shape=layer.w2_weight.size(), + w1_shape=layer.routed_experts.w13_weight.size(), + w2_shape=layer.routed_experts.w2_weight.size(), rank=rank, top_k=top_k, dtype=config_dtype, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index a28fd06930c6..28bd2caf0a99 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoeWeightScaleSupported, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEActivationFormat, @@ -63,6 +64,7 @@ def get_config() -> dict[str, Any] | None: "FusedMoEActivationFormat", "FusedMoEPrepareAndFinalizeModular", "GateLinear", + "RoutedExperts", "RoutingMethodType", "SharedFusedMoE", "activation_without_mul", diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 8f40a63fa211..b5fd754a387d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -123,18 +123,42 @@ def is_monolithic(self) -> bool: def apply( self, - layer: "FusedMoE", # type: 
ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: + """ + Apply the MoE operation using modular kernels. + + Args: + layer: RoutedExperts instance containing weight parameters + x: Input tensor + topk_weights: Expert weights from router + topk_ids: Selected expert IDs from router + shared_experts_input: Input for shared experts (if any) + + Returns: + Output tensor from routed experts + """ raise NotImplementedError def apply_monolithic( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: + """ + Apply the MoE operation using monolithic kernels. + + Args: + layer: RoutedExperts instance containing weight parameters + x: Input tensor + router_logits: Router logits (routing done internally) + + Returns: + Output tensor from routed experts + """ raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 607791034bbf..a3145682daad 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -146,1208 +146,1410 @@ def maybe_roundup_hidden_size( class RoutedExperts(torch.nn.Module): - def __init__(self): - super().__init__() - - -# --8<-- [start:fused_moe] -@CustomOp.register("fused_moe") -class FusedMoE(CustomOp): - """FusedMoE layer for MoE models. - - This layer contains both MergedColumnParallel weights (gate_up_proj / - w13) and RowParallelLinear weights (down_proj/ w2). + """ + Container for routed expert weights and execution logic. - Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. 
We - copy that naming convention here and handle any remapping in the - load_weights function in each model implementation. + This module owns the expert weight parameters (w13_weight, w2_weight, scales, etc.) + and handles: + - Loading checkpoint weights into parameters + - Executing routed experts via quant_method.apply() - Args: - num_experts: Number of experts in the model - top_k: Number of experts selected for each token - hidden_size: Input hidden state size of the transformer - intermediate_size: Intermediate size of the experts - params_dtype: Data type for the parameters. - renormalize: Whether to renormalize the logits in the fused_moe kernel - quant_config: Quantization configure. - enable_eplb: Whether to enable expert parallelism load balancer. - router_logits_dtype: Data type for router logits buffers. + Weight parameters are registered on this module via _ParameterRegistrationWrapper + during FusedMoE initialization. """ - # --8<-- [end:fused_moe] - def __init__( self, - num_experts: int, # Global number of experts - top_k: int, - hidden_size: int, + layer_name: str, + params_dtype: torch.dtype, + unpadded_hidden_size: int, # put in moe_config? 
intermediate_size: int, - params_dtype: torch.dtype | None = None, - renormalize: bool = True, - use_grouped_topk: bool = False, - num_expert_group: int | None = None, - topk_group: int | None = None, - quant_config: QuantizationConfig | None = None, - tp_size: int | None = None, - ep_size: int | None = None, - dp_size: int | None = None, - pcp_size: int | None = None, - prefix: str = "", - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - is_act_and_mul: bool = True, - enable_eplb: bool = False, - num_redundant_experts: int = 0, - has_bias: bool = False, - is_sequence_parallel=False, - expert_mapping: list[tuple[str, str, int, str]] | None = None, - n_shared_experts: int | None = None, - router_logits_dtype: torch.dtype | None = None, - gate: torch.nn.Module | None = None, - shared_experts: torch.nn.Module | None = None, - routed_input_transform: torch.nn.Module | None = None, - routed_output_transform: torch.nn.Module | None = None, - apply_scale_to_output: bool = False, - zero_expert_type: str | None = None, + moe_config: FusedMoEConfig, + quant_config: QuantizationConfig | None, + quant_method: FusedMoEMethodBase, + expert_map_manager: ExpertMapManager, + **kwargs, ): super().__init__() + self.layer_name = layer_name + self.moe_config = moe_config + self.quant_config = quant_config + self.quant_method = quant_method + self.expert_map_manager = expert_map_manager + self.hidden_size = moe_config.hidden_dim + self.intermediate_size_per_partition = ( + moe_config.intermediate_size_per_partition + ) + self.global_num_experts = moe_config.num_experts + self.local_num_experts = moe_config.num_local_experts - if params_dtype is None: - params_dtype = torch.get_default_dtype() - self.params_dtype = params_dtype + # Register buffers for state_dict compatibility + if 
self.expert_map_manager.expert_map is not None: + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - vllm_config = get_current_vllm_config() + if self.expert_map_manager.expert_mask is not None: + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - # FIXME (varun): We should have a better way of inferring the activation - # datatype. This works for now as the tensor datatype entering the MoE - # operation is typically unquantized (i.e. float16/bfloat16). - if vllm_config.model_config is not None: - moe_in_dtype = vllm_config.model_config.dtype - else: - # TODO (bnell): This is a hack to get test_mixtral_moe to work - # since model_config is not set in the pytest test. - moe_in_dtype = params_dtype + # Bit of hack until things are settled + self.__dict__.update(kwargs) - tp_size_ = ( - tp_size if tp_size is not None else get_tensor_model_parallel_world_size() - ) - dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size - pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size + moe_quant_params = { + "num_experts": moe_config.num_local_experts, + "hidden_size": moe_config.hidden_dim, + "unpadded_hidden_size": unpadded_hidden_size, + "intermediate_size_per_partition": ( + moe_config.intermediate_size_per_partition + ), + "params_dtype": params_dtype, + "weight_loader": self.weight_loader, + "global_num_experts": moe_config.num_experts, + } - self.is_sequence_parallel = is_sequence_parallel - self.sp_size = tp_size_ if is_sequence_parallel else 1 + # need full intermediate size pre-sharding for WNA16 act order + if self._needs_intermediate_size_param(quant_method): + moe_quant_params["intermediate_size_full"] = intermediate_size - self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( - tp_size_=tp_size_, - pcp_size_=pcp_size_, - dp_size_=dp_size_, - sp_size_=self.sp_size, - vllm_parallel_config=vllm_config.parallel_config, + 
quant_method.create_weights(layer=self, **moe_quant_params) + + # TODO(bnell): make this a method on quant_method + def _needs_intermediate_size_param(self, quant_method: FusedMoEMethodBase) -> bool: + return quant_method.__class__.__name__ in ( + "GPTQMarlinMoEMethod", + "CompressedTensorsWNA16MarlinMoEMethod", + "CompressedTensorsWNA16MoEMethod", ) - assert self.moe_parallel_config.is_sequence_parallel == is_sequence_parallel + def _ensure_moe_quant_config_init(self): + if self.quant_method.moe_quant_config is None: + # Note: the moe_quant_config can't be constructed until after + # weight loading post processing. + self.quant_method.moe_quant_config = ( + self.quant_method.get_fused_moe_quant_config(self) + ) - logger.debug("FusedMoEParallelConfig = %s", str(self.moe_parallel_config)) + @property + def expert_map(self) -> torch.Tensor | None: + return ( + self.expert_map_manager.expert_map + if not self.rocm_aiter_fmoe_enabled + else self.expert_map_manager.expert_mask + ) - self.global_num_experts = num_experts + num_redundant_experts - self.logical_num_experts = num_experts + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + """Get routing tables (already initialized by manager).""" + # Return routing tables from manager + routing_tables = self.expert_map_manager.routing_tables - # Initialize EPLB manager (or None?) 
- eplb_manager: EplbManager | None = None - if enable_eplb: - eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) + if routing_tables is None: + return None - # Expert mapping used in self.load_weights - self.expert_mapping = expert_mapping + # Register buffers for backward compatibility if not already registered + if not hasattr(self, "expert_global_to_physical"): + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) - # For smuggling this layer into the fused moe custom op - compilation_config = vllm_config.compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError("Duplicate layer name: {}".format(prefix)) - compilation_config.static_forward_context[prefix] = self - compilation_config.static_all_moe_layers.append(prefix) - self.layer_name = prefix - expert_placement_strategy: ExpertPlacementStrategy = ( - vllm_config.parallel_config.expert_placement_strategy - ) + return routing_tables - # ROCm aiter shared experts fusion - # AITER only supports gated activations (silu/gelu), so disable it - # for non-gated MoE (is_act_and_mul=False) - self.rocm_aiter_fmoe_enabled = ( - rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul - ) - self.aiter_fmoe_shared_expert_enabled = ( - rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul - ) + def update_expert_map(self): + """Update expert mappings for new EP configuration.""" + # ep_size and ep_rank should already be updated in moe_parallel_config + self.expert_map_manager.update() - self.num_fused_shared_experts = ( - n_shared_experts - if n_shared_experts is not None and self.aiter_fmoe_shared_expert_enabled - else 0 - ) - if ( - not self.aiter_fmoe_shared_expert_enabled - and self.num_fused_shared_experts != 0 - ): - 
raise ValueError( - "n_shared_experts is only supported on ROCm aiter when " - "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled" - ) + # Re-register buffers for state_dict compatibility + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - # Determine expert maps - if self.use_ep: - if eplb_manager is not None: - # Validate EPLB configuration - eplb_manager.validate_configuration( - self.global_num_experts, self.ep_size - ) - else: - assert num_redundant_experts == 0, ( - "Redundant experts are only supported with EPLB." - ) + # Update routing table buffers if needed + self._maybe_init_expert_routing_tables() - # Determine expert placement strategy before creating manager - expert_placement_strategy_effective = determine_expert_placement_strategy( - expert_placement_strategy=expert_placement_strategy, - moe_parallel_config=self.moe_parallel_config, - num_expert_group=num_expert_group, - num_redundant_experts=num_redundant_experts, - enable_eplb=eplb_manager is not None, + # Handle AITER shared experts if needed + if self.aiter_fmoe_shared_expert_enabled: + self._init_aiter_shared_experts_topK_buffer( + vllm_config=get_current_vllm_config(), + dp_size=get_dp_group().world_size, ) - else: - expert_placement_strategy_effective = expert_placement_strategy - # Create expert map manager - self.expert_map_manager = ExpertMapManager( - global_num_experts=self.global_num_experts, - logical_num_experts=self.logical_num_experts, - moe_parallel_config=self.moe_parallel_config, - placement_strategy=expert_placement_strategy_effective, - num_fused_shared_experts=self.num_fused_shared_experts, - rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, - device=vllm_config.device_config.device, - ) + def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: + """Map global expert ID to local expert ID.""" + return self.expert_map_manager.map_global_to_local(expert_id) - # 
Register buffers for state_dict compatibility - if self.expert_map_manager.expert_map is not None: - self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + # + # Weight Loading Methods + # - if self.expert_map_manager.expert_mask is not None: - self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + def _load_per_tensor_weight_scale( + self, + shard_id: str, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + expert_id: int, + ): + param_data = param.data + # for per tensor weight quantization + if shard_id in ("w1", "w3"): + # We have to keep the weight scales of w1 and w3 because + # we need to re-quantize w1/w3 weights after weight loading. + idx = 0 if shard_id == "w1" else 1 + param_data[expert_id][idx] = loaded_weight + # If we are in the row parallel case (down_proj) + elif shard_id == "w2": + param_data[expert_id] = loaded_weight - # Log EP configuration (move into EMM?) - if self.use_ep: - logger.info_once( - "[EP Rank %s/%s] Expert parallelism is enabled. Expert " - "placement strategy: %s. Local/global" - " number of experts: %s/%s. Experts local to global index map:" - " %s.", - self.ep_rank, - self.ep_size, - self.expert_map_manager.placement_strategy, - self.expert_map_manager.local_num_experts, - self.expert_map_manager.global_num_experts, - self.expert_map_manager.get_compressed_map_string(), + def _load_combined_w13_weight_scale( + self, + shard_dim: int, + loaded_weight: torch.Tensor, + param: torch.Tensor, + tp_rank: int, + ): + """ + Load w13 weight scales assuming that w1 weight scales and w3 weight + scales are stored in the same loaded_weight tensor. 
+ """ + shard_size = param.shape[shard_dim] + loaded_weight = loaded_weight.narrow( + shard_dim, shard_size * tp_rank, shard_size + ) + param.copy_(loaded_weight) + + def _load_model_weight_or_group_weight_scale( + self, + shard_dim: int, + expert_data: torch.Tensor, + shard_id: str, + loaded_weight: torch.Tensor, + tp_rank: int, + load_full_w2: bool = False, + ): + """ + Load grouped weight scales for group quantization or model weights + :param shard_dim: dimension to shard + :param expert_data: parameter for a particular expert + :param shard_id: either w1, w2, or w3 + :param loaded_weight: checkpoint weight to load into the param + :param tp_rank: tensor parallel rank + :param load_full_w2: whether or not the w2 loaded should be sharded. + """ + if shard_id == "w2": + # In the case where we have actorder/g_idx, we do not partition the + # w2 scales, as indicated by `load_full` argument, for all tp cases + self._load_w2( + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank, + load_full=load_full_w2, + ) + elif shard_id in ("w1", "w3"): + self._load_w13( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank, ) - self.top_k = top_k + def _load_per_channel_weight_scale( + self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.Tensor, + tp_rank: int, + ): + # for per channel weight quantization + if shard_id == "w2": + expert_data.copy_(loaded_weight) + elif shard_id in ("w1", "w3"): + self._load_w13( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank, + ) - # move into EMM? 
- self._init_aiter_shared_experts_topK_buffer( - vllm_config=vllm_config, dp_size=dp_size_ - ) - # XXXXX move into EMM - if self.use_ep and self.rocm_aiter_fmoe_enabled: - expert_mask = self.expert_map_manager.expert_mask - assert expert_mask is None or torch.all( - (expert_mask == 0) | (expert_mask == 1) - ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." + def _load_w13( + self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.Tensor, + tp_rank: int, + load_full: bool = False, + ): + # Index the loaded weight for tp sharding. + # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim + if self.moe_config.is_act_and_mul: + shard_size = expert_data.shape[shard_dim] // 2 + else: + shard_size = expert_data.shape[shard_dim] + # Only narrow if the loaded_weight is not a scalar (0-dim tensor) + # and we're not loading the full weight + if not load_full and loaded_weight.ndim > 0: + loaded_weight = loaded_weight.narrow( + shard_dim, shard_size * tp_rank, shard_size + ) + # Narrow parameter and load. + # w1, gate_proj: Load into first logical weight of w13. + if shard_id == "w1": + expert_data = expert_data.narrow(shard_dim, 0, shard_size) + # w3, up_proj: Load into second logical weight of w13. + else: + assert shard_id == "w3" + expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) + expert_data.copy_(loaded_weight) - assert intermediate_size % self.tp_size == 0 - self.intermediate_size_per_partition = intermediate_size // self.tp_size - self.renormalize = renormalize + def _load_w2( + self, + expert_data: torch.Tensor, + shard_dim: int, + loaded_weight: torch.Tensor, + tp_rank: int, + load_full: bool = False, + ): + # Index the loaded weight for tp sharding. + # down_proj: "RowParallel" so tp sharding on input_dim + # Narrow parameter and load. 
+ shard_size = expert_data.shape[shard_dim] + # Only narrow if the loaded_weight is not a scalar (0-dim tensor) + # and we're not loading the full weight + if not load_full and loaded_weight.ndim > 0: + loaded_weight = loaded_weight.narrow( + shard_dim, shard_size * tp_rank, shard_size + ) + # w2, down_proj: Load into only logical weight of w2. + expert_data.copy_(loaded_weight) - # TODO(bnell): these attributes are only used by monolithic kernels. - # Put them in a MoERouterConfig dataclass? - self.use_grouped_topk = use_grouped_topk - if self.use_grouped_topk: - assert num_expert_group is not None and topk_group is not None - self.num_expert_group = num_expert_group - self.topk_group = topk_group - self.custom_routing_function = custom_routing_function - self.scoring_func = scoring_func - self.routed_scaling_factor = routed_scaling_factor - self.e_score_correction_bias = e_score_correction_bias - # TODO(bnell): end attributes + def _load_single_value( + self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int + ): + param_data = param.data - # Store in runner? - self.apply_router_weight_on_input = apply_router_weight_on_input - self.activation = MoEActivation.from_str(activation) + # Input scales can be loaded directly and should be equal. + param_data[expert_id] = loaded_weight - self._runner: MoERunner + def _load_g_idx( + self, + shard_id: str, + expert_data: torch.Tensor, + shard_dim: int, + loaded_weight: torch.Tensor, + tp_rank: int, + ): + if shard_id == "w2": + self._load_w2( + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank, + ) + else: + assert shard_id in ("w1", "w3") + expert_data.copy_(loaded_weight) - # TODO(bnell): we should not have to create a router if the kernel is - # monolithic. 
- router = create_fused_moe_router( - top_k=top_k, - global_num_experts=self.global_num_experts, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor - if not apply_scale_to_output - else 1.0, - e_score_correction_bias=e_score_correction_bias, - num_fused_shared_experts=self.num_fused_shared_experts, - eplb_manager=eplb_manager, - # TODO(bnell): once we can construct the MK at init time, we - # can make this a value. - indices_type_getter=lambda: self._runner.quant_method.topk_indices_dtype, - zero_expert_type=zero_expert_type, - num_logical_experts=self.logical_num_experts, - ) - self.routing_method_type: RoutingMethodType = router.routing_method_type + @overload + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: Literal[False], + ) -> None: ... - # TODO(bnell): is this redundant now? - # When using zero experts, slice e_score_correction_bias to cover - # only real experts, for compatibility with monolithic kernels that - # read it directly. - if zero_expert_type is not None and e_score_correction_bias is not None: - self.e_score_correction_bias = e_score_correction_bias[ - : self.logical_num_experts - ] + @overload + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: Literal[True], + ) -> bool: ... - # Round up hidden size before creating moe_config. - # This way moe_config is created with the correct hidden_size from the start. 
- unpadded_hidden_size = hidden_size - self.model_type = ( - vllm_config.model_config.hf_config.model_type - if vllm_config.model_config is not None - else None + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: bool = False, + ) -> bool | None: + if self.quant_config and self.quant_config.get_name() == "mxfp4": + # (FIXME) for gpt-oss all experts are combined + if "bias" in weight_name: + dim1 = loaded_weight.shape[1] + param.data[:, :dim1].copy_(loaded_weight) + else: + dim1 = loaded_weight.shape[1] + dim2 = loaded_weight.shape[2] + param.data[:, :dim1, :dim2].copy_(loaded_weight) + return True if return_success else None + + quant_method_name = self.quant_method.__class__.__name__ + global_expert_id = expert_id + expert_id = self.layer._map_global_expert_id_to_local_expert_id( + global_expert_id ) - hidden_size = maybe_roundup_hidden_size( - hidden_size=hidden_size, - act_dtype=moe_in_dtype, - moe_parallel_config=self.moe_parallel_config, - is_lora_enabled=vllm_config.lora_config is not None, - model_type=self.model_type, + + use_global_sf = ( + getattr(self.quant_method, "use_global_sf", False) + and "input_scale" in weight_name ) - self.hidden_size = hidden_size - self.moe_config: FusedMoEConfig = FusedMoEConfig( - num_experts=self.global_num_experts, - experts_per_token=top_k, - hidden_dim=hidden_size, - intermediate_size_per_partition=self.intermediate_size_per_partition, - num_local_experts=self.local_num_experts, - num_logical_experts=self.logical_num_experts, - moe_parallel_config=self.moe_parallel_config, - in_dtype=moe_in_dtype, - moe_backend=vllm_config.kernel_config.moe_backend, - router_logits_dtype=router_logits_dtype, - max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, - has_bias=has_bias, - is_act_and_mul=is_act_and_mul, - is_lora_enabled=vllm_config.lora_config is not None, - activation=self.activation, - 
device=vllm_config.device_config.device, - routing_method=self.routing_method_type, - # TODO: in_dtype == out_dtype? - disable_inplace=disable_inplace() or shared_experts is not None, - ) + if expert_id == -1 and not use_global_sf: + # Failed to load this param since it's not local to this rank + return False if return_success else None + # Hereafter, `expert_id` is local physical id + + # is_transposed: if the dim to shard the weight + # should be flipped. Required by GPTQ, compressed-tensors + # should be whatever dimension intermediate_size_per_partition is + is_transposed = getattr(param, "is_transposed", False) + + # compressed-tensors checkpoints with packed weights are stored flipped + # TODO (mgoin): check self.layer._runner.quant_method.quant_config.quant_format + # against known CompressionFormat enum values that have this quality + if quant_method_name in ( + "CompressedTensorsWNA16MarlinMoEMethod", + "CompressedTensorsWNA16MoEMethod", + ): + if is_transposed: + loaded_weight = loaded_weight.t().contiguous() + else: + loaded_weight = loaded_weight + + if shard_id not in ("w1", "w2", "w3"): + raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.") + + # Fetch the dim to shard the parameter/loaded weight + # based on the shard id. This will be whatever + # dimension intermediate_size_per_partition is used. 
+ SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} + + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + param.data.copy_(loaded_weight) + return True if return_success else None + + # Case for BitsAndBytes + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + if use_bitsandbytes_4bit: + shard_dim = 0 + + expert_data = param.data[expert_id] + if shard_id == "w2": + expert_data.copy_(loaded_weight) + elif shard_id in ("w1", "w3"): + # BNB inflight quantization has already sharded the weights + full_load = True + self._load_w13( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + load_full=full_load, + ) + return True if return_success else None + + shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] + if is_transposed: + shard_dim = int(not shard_dim) + + full_load = len(loaded_weight.shape) == 3 + if full_load: + shard_dim += 1 + + # Materialize GGUF UninitializedParameter accounting merged weights + if is_gguf_weight and isinstance(param, UninitializedParameter): + # To materialize a tensor, we must have full shape including + # number of experts, making this portion to require `full_load`. + assert full_load + final_shape = list(loaded_weight.shape) + # w1 and w3 are merged per expert. + if shard_id in {"w1", "w3"}: + final_shape[1] *= 2 + final_shape[shard_dim] = final_shape[shard_dim] // self.moe_config.tp_size + param.materialize(final_shape, dtype=loaded_weight.dtype) + + expert_data = param.data if full_load else param.data[expert_id] - # Move XXXXXXXXXXXXX - if self.moe_config.use_mori_kernels: - assert self.rocm_aiter_fmoe_enabled, ( - "Mori needs to be used with aiter fused_moe for now." 
+ # Case input scale: input_scale loading is only supported for fp8 + if "input_scale" in weight_name: + # this is needed for compressed-tensors only + loaded_weight = loaded_weight.to(param.data.device) + + if ( + "compressed" in quant_method_name.lower() + and param.data[expert_id] != 1 + and (param.data[expert_id] - loaded_weight).abs() > 1e-5 + ): + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param.data[expert_id]} " + f"vs. {loaded_weight}" + ) + + self._load_single_value( + param=param, + loaded_weight=loaded_weight, + expert_id=global_expert_id if use_global_sf else expert_id, ) - assert not self.aiter_fmoe_shared_expert_enabled, ( - "Mori does not support fusion shared expert now. " - "Turn it off by setting VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0" + return True if return_success else None + + # Case g_idx + if "g_idx" in weight_name: + self._load_g_idx( + shard_dim=0, + shard_id=shard_id, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, ) + return True if return_success else None - self.quant_config = quant_config + # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern + if "ModelOpt" in quant_method_name: + # Determine per-tensor weight scale patterns based on variant + # Use the dedicated method instead of brittle string matching + uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() + quant_method = getattr(param, "quant_method", None) - logger.debug("FusedMoEConfig = %s", self.moe_config) + # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) + # weights scales. + # Input scales are always per-tensor. + # Weight scales: FP4 uses "weight_scale_2" and FP8 uses + # "weight_scale" for per-tensor scales. + # NOTE: ModelOpt MXFP8 MoE uses block scales in weight_scale + # tensors (quant_method=BLOCK), so those must not be treated + # as per-tensor scalars here. 
+ is_block_weight_scale = ( + "weight_scale" in weight_name + and quant_method == FusedMoeWeightScaleSupported.BLOCK.value + ) + is_per_tensor = ( + "weight_scale_2" in weight_name + if uses_weight_scale_2 + else "weight_scale" in weight_name + ) or "input_scale" in weight_name + is_per_tensor = is_per_tensor and not is_block_weight_scale + if is_per_tensor: + self._load_per_tensor_weight_scale( + shard_id=shard_id, + param=param, + loaded_weight=loaded_weight, + expert_id=expert_id, + ) + return True if return_success else None - quant_method = self._get_quant_method( - prefix, - quant_config, - self.moe_config, - ) + # If the weight is w13_weight_scale and w13_weight_scales are + # combined into single loaded_weight, call + # _load_combined_w13_weight_scale() to load it. + # This is checked by comparing the hidden_out dims of the + # loaded_weight and the param. + if "w13_weight_scale" in weight_name: + loaded_weight_hidden_out = loaded_weight.shape[-2] + param_hidden_out = param.data.shape[-2] * self.moe_config.tp_size + if loaded_weight_hidden_out == param_hidden_out: + self._load_combined_w13_weight_scale( + shard_dim=shard_dim, + loaded_weight=loaded_weight, + param=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + return True if return_success else None - # TODO(bnell): only for weight loading. how to get around this? - self.quant_method = quant_method + # For other weights, call _load_model_weight_or_group_weight_scale() + # to load it. 
+ if "weight" in weight_name: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + return True if return_success else None - # Move XXXXXXXXXXXXX - if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): - raise NotImplementedError( - "is_act_and_mul=False is supported only for CUDA and ROCm for now" + # Case weight scales, zero_points and offset, weight/input global scales + if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name: + # load the weight scales and zp based on the quantization scheme + # supported weight scales/zp can be found in + # FusedMoeWeightScaleSupported + # TODO @dsikka: once hardened, refactor to use vLLM Parameters + # specific to each case + quant_method = getattr(param, "quant_method", None) + if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value: + self._load_per_channel_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + elif quant_method in [ + FusedMoeWeightScaleSupported.GROUP.value, + FusedMoeWeightScaleSupported.BLOCK.value, + ]: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + load_full_w2=getattr(param, "load_full_w2", False), + ) + elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: + self._load_per_tensor_weight_scale( + shard_id=shard_id, + param=param, + loaded_weight=loaded_weight, + expert_id=expert_id, + ) + else: + WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported] + raise ValueError( + f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}" + ) + return True if return_success else None + + # Case weight_shape + if "weight_shape" in weight_name: + # only 
required by compressed-tensors + self._load_single_value( + param=param, loaded_weight=loaded_weight, expert_id=expert_id ) + return True if return_success else None - # Move XXXXXXXXXXXXX - if eplb_manager is not None and not quant_method.supports_eplb: - # TODO: Add support for additional quantization methods. - # The implementation for other quantization methods does not - # contain essential differences, but the current quant API - # design causes duplicated work when extending to new - # quantization methods, so I'm leaving it for now. - # If you plan to add support for more quantization methods, - # please refer to the implementation in `Fp8MoEMethod`. - raise NotImplementedError( - f"EPLB is not supported {quant_method.__class__.__name__}." + # Case model weights + if "weight" in weight_name: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + return True if return_success else None + + return False if return_success else None + + def load_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[str]: + if (expert_mapping := self.layer.expert_mapping) is None: + raise ValueError( + "`self.layer.expert_mapping` must be provided to " + "load weights using `self.load_weights`." 
+ ) + for expert_name, loaded_weight in weights: + qual_name = f"{self.layer_name}.{expert_name}" + for param_name, weight_name, expert_id, shard_id in expert_mapping: + if weight_name not in qual_name: + continue + weight_name = qual_name.replace(weight_name, param_name) + param_name = weight_name.removeprefix(f"{self.layer_name}.") + param = getattr(self, param_name) + # Fused expert weights can be identified by their 3D tensors + if loaded_weight.dim() == 3: + # Repurpose expert_id as shard_idx for deconcatenating w1 and w3 + if shard_id in {"w1", "w3"}: + shard_idx = expert_id + experts_shard = loaded_weight.chunk(2, dim=1)[shard_idx] + else: + experts_shard = loaded_weight + start = 0 + else: + # loaded_weight is a single expert weight, so we add a dummy expert + # dimension to unify the loading logic with the fused case + experts_shard = loaded_weight.unsqueeze(0) + start = expert_id + + # Unified loading logic for fused and non-fused experts + loaded_experts = experts_shard.unbind() + for expert_id, loaded_expert in enumerate(loaded_experts, start=start): + success = self.weight_loader( + param=param, + loaded_weight=loaded_expert, + weight_name=weight_name, + shard_id=shard_id, + expert_id=expert_id, + return_success=True, + ) + if success: + logger.debug( + "Loaded expert %d of shard %s into %s for layer %s", + expert_id, + shard_id, + param_name, + self.layer_name, + ) + yield param_name + + # + # Execution + # + + def forward( + self, + x: torch.Tensor, + topk_weights: torch.Tensor | None = None, + topk_ids: torch.Tensor | None = None, + router_logits: torch.Tensor | None = None, + shared_experts_input: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Execute routed experts using the quantization method's apply function. + + This is called by the runner after router selection (for modular kernels) + or with router logits (for monolithic kernels). It delegates to + quant_method.apply() which accesses the weights on this RoutedExperts + instance. 
+ + Args: + x: Input tensor after any transforms + topk_weights: Routing weights from router (for modular kernels) + topk_ids: Selected expert IDs from router (for modular kernels) + router_logits: Router logits (for monolithic kernels) + shared_experts_input: Input for shared experts (if any) + + Returns: + Output tensor from routed experts + """ + quant_method = self.quant_method + + if quant_method.is_monolithic: + # Monolithic kernels handle routing internally + return quant_method.apply_monolithic( + layer=self, # Pass RoutedExperts as layer + x=x, + router_logits=router_logits, + ) + else: + # Modular kernels use pre-computed routing + return quant_method.apply( + layer=self, # Pass RoutedExperts as layer + x=x, + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts_input=shared_experts_input, ) - moe_quant_params = { - "num_experts": self.local_num_experts, - "hidden_size": hidden_size, - "unpadded_hidden_size": unpadded_hidden_size, - "intermediate_size_per_partition": self.intermediate_size_per_partition, - "params_dtype": params_dtype, - "weight_loader": self.weight_loader, - "global_num_experts": self.global_num_experts, - } - - # need full intermediate size pre-sharding for WNA16 act order - if self._needs_intermediate_size_param(quant_method): - moe_quant_params["intermediate_size_full"] = intermediate_size - quant_method.create_weights(layer=self, **moe_quant_params) +# Mark the RoutedExperts weight_loader as supporting MoE-specific parameters +RoutedExperts.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] - # Storing the runner in the FusedMoE is an intermediate state, eventually - # the runner will own the FusedMoE layer and provide the execution interface - # for MoE ops. 
- self._runner = create_moe_runner( - layer=self, - moe_config=self.moe_config, - router=router, - routed_input_transform=routed_input_transform, - routed_output_transform=routed_output_transform, - gate=gate, - shared_experts=shared_experts, - quant_method=quant_method, - enable_dbo=vllm_config.parallel_config.enable_dbo, - apply_scale_to_output=apply_scale_to_output, - routed_scaling_factor=routed_scaling_factor, - ) - # TODO(bnell): make this a method on quant_method - def _needs_intermediate_size_param(self, quant_method: FusedMoEMethodBase) -> bool: - return quant_method.__class__.__name__ in ( - "GPTQMarlinMoEMethod", - "CompressedTensorsWNA16MarlinMoEMethod", - "CompressedTensorsWNA16MoEMethod", - ) +# --8<-- [start:fused_moe] +@CustomOp.register("fused_moe") +class FusedMoE(CustomOp): + """FusedMoE layer for MoE models. - def extra_repr(self) -> str: - s = ( - f"global_num_experts={self.global_num_experts}, " - f"local_num_experts={self.local_num_experts}, " - f"top_k={self.top_k}, " - f"intermediate_size_per_partition={self.intermediate_size_per_partition}, " # noqa: E501 - f"tp_size={self.tp_size},\n" - f"ep_size={self.ep_size}, " - ) + This layer contains both MergedColumnParallel weights (gate_up_proj / + w13) and RowParallelLinear weights (down_proj/ w2). - return s + Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We + copy that naming convention here and handle any remapping in the + load_weights function in each model implementation. - def _get_quant_method( - self, - prefix: str, - quant_config: QuantizationConfig | None, - moe_config: FusedMoEConfig, - ) -> FusedMoEMethodBase: - """ - Helper method to ensure quant_method is never None and - of the proper type. 
- """ - quant_method = None - if quant_config is not None: - quant_method = quant_config.get_quant_method(self, prefix) - if quant_method is None: - quant_method = UnquantizedFusedMoEMethod(moe_config) - assert isinstance(quant_method, FusedMoEMethodBase) - return quant_method + Args: + num_experts: Number of experts in the model + top_k: Number of experts selected for each token + hidden_size: Input hidden state size of the transformer + intermediate_size: Intermediate size of the experts + params_dtype: Data type for the parameters. + renormalize: Whether to renormalize the logits in the fused_moe kernel + quant_config: Quantization configure. + enable_eplb: Whether to enable expert parallelism load balancer. + router_logits_dtype: Data type for router logits buffers. + """ - # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py - # and vllm/distributed/elastic_ep/elastic_execute.py - # can safely swap out the quant_method. We should figure out a less - # intrusive way to do this. - def _replace_quant_method(self, mk: FusedMoEMethodBase): - self._runner._replace_quant_method(mk) + # --8<-- [end:fused_moe] - def _ensure_moe_quant_config_init(self): - if self._runner.quant_method.moe_quant_config is None: - # Note: the moe_quant_config can't be constructed until after - # weight loading post processing. 
- self._runner.quant_method.moe_quant_config = ( - self._runner.quant_method.get_fused_moe_quant_config(self) - ) + def __init__( + self, + num_experts: int, # Global number of experts + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype | None = None, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: int | None = None, + topk_group: int | None = None, + quant_config: QuantizationConfig | None = None, + tp_size: int | None = None, + ep_size: int | None = None, + dp_size: int | None = None, + pcp_size: int | None = None, + prefix: str = "", + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + is_act_and_mul: bool = True, + enable_eplb: bool = False, + num_redundant_experts: int = 0, + has_bias: bool = False, + is_sequence_parallel=False, + expert_mapping: list[tuple[str, str, int, str]] | None = None, + n_shared_experts: int | None = None, + router_logits_dtype: torch.dtype | None = None, + gate: torch.nn.Module | None = None, + shared_experts: torch.nn.Module | None = None, + routed_input_transform: torch.nn.Module | None = None, + routed_output_transform: torch.nn.Module | None = None, + apply_scale_to_output: bool = False, + zero_expert_type: str | None = None, + ): + super().__init__() - # Note: maybe_init_modular_kernel should only be called by - # prepare_communication_buffer_for_model. - # This is called after all weight loading and post-processing, so it - # should be safe to swap out the quant_method. - def maybe_init_modular_kernel(self) -> None: - # NOTE(rob): WIP refactor. For quant methods that own the MK - # we create the MK during process_weights_after_loading. 
- if ( - self._runner.quant_method.supports_internal_mk - or self._runner.quant_method.is_monolithic - ): - return None + if params_dtype is None: + params_dtype = torch.get_default_dtype() + self.params_dtype = params_dtype - self._ensure_moe_quant_config_init() - # routing_tables only needed for round-robin expert placement with - # DeepEP all2all backend. - routing_tables = self._maybe_init_expert_routing_tables() + vllm_config = get_current_vllm_config() - if isinstance(self._runner.quant_method, FusedMoEModularMethod): - base_quant_method = self._runner.quant_method.old_quant_method + # FIXME (varun): We should have a better way of inferring the activation + # datatype. This works for now as the tensor datatype entering the MoE + # operation is typically unquantized (i.e. float16/bfloat16). + if vllm_config.model_config is not None: + moe_in_dtype = vllm_config.model_config.dtype else: - base_quant_method = self._runner.quant_method + # TODO (bnell): This is a hack to get test_mixtral_moe to work + # since model_config is not set in the pytest test. 
+ moe_in_dtype = params_dtype - prepare_finalize = base_quant_method.maybe_make_prepare_finalize( - routing_tables=routing_tables + tp_size_ = ( + tp_size if tp_size is not None else get_tensor_model_parallel_world_size() ) - if prepare_finalize is not None: - logger.debug( - "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) - ) - self._replace_quant_method( - FusedMoEModularMethod.make( - self, - base_quant_method, - prepare_finalize, - self.shared_experts, - inplace=not self.moe_config.disable_inplace, - ) - ) + dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size + pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size - # - # Properties - # + self.is_sequence_parallel = is_sequence_parallel + self.sp_size = tp_size_ if is_sequence_parallel else 1 - @property - def layer_id(self): - # Delayed import to avoid circular dependency - from vllm.model_executor.models.utils import extract_layer_index + self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( + tp_size_=tp_size_, + pcp_size_=pcp_size_, + dp_size_=dp_size_, + sp_size_=self.sp_size, + vllm_parallel_config=vllm_config.parallel_config, + ) - return extract_layer_index(self.layer_name) + assert self.moe_parallel_config.is_sequence_parallel == is_sequence_parallel - @property - def tp_size(self): - return self.moe_parallel_config.tp_size + logger.debug("FusedMoEParallelConfig = %s", str(self.moe_parallel_config)) - @property - def ep_size(self): - return self.moe_parallel_config.ep_size + self.global_num_experts = num_experts + num_redundant_experts + self.logical_num_experts = num_experts - @property - def tp_rank(self): - return self.moe_parallel_config.tp_rank + # Initialize EPLB manager (or None?) 
+ eplb_manager: EplbManager | None = None + if enable_eplb: + eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) - @property - def ep_rank(self): - return self.moe_parallel_config.ep_rank + # Expert mapping used in self.load_weights + self.expert_mapping = expert_mapping - @property - def use_ep(self): - return self.moe_parallel_config.use_ep + # For smuggling this layer into the fused moe custom op + compilation_config = vllm_config.compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError("Duplicate layer name: {}".format(prefix)) + compilation_config.static_forward_context[prefix] = self + compilation_config.static_all_moe_layers.append(prefix) + self.layer_name = prefix + expert_placement_strategy: ExpertPlacementStrategy = ( + vllm_config.parallel_config.expert_placement_strategy + ) - # XXXXXXXXX keep this separate - @property - def local_num_experts(self) -> int: - """Number of experts assigned to this rank.""" - return self.expert_map_manager.local_num_experts + # ROCm aiter shared experts fusion + # AITER only supports gated activations (silu/gelu), so disable it + # for non-gated MoE (is_act_and_mul=False) + self.rocm_aiter_fmoe_enabled = ( + rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul + ) + self.aiter_fmoe_shared_expert_enabled = ( + rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul + ) + + self.num_fused_shared_experts = ( + n_shared_experts + if n_shared_experts is not None and self.aiter_fmoe_shared_expert_enabled + else 0 + ) + if ( + not self.aiter_fmoe_shared_expert_enabled + and self.num_fused_shared_experts != 0 + ): + raise ValueError( + "n_shared_experts is only supported on ROCm aiter when " + "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled" + ) - @property - def expert_placement_strategy(self) -> ExpertPlacementStrategy: - """Expert placement strategy ('linear' or 'round_robin').""" - return self.expert_map_manager.placement_strategy + # 
Determine expert maps + if self.use_ep: + if eplb_manager is not None: + # Validate EPLB configuration + eplb_manager.validate_configuration( + self.global_num_experts, self.ep_size + ) + else: + assert num_redundant_experts == 0, ( + "Redundant experts are only supported with EPLB." + ) - @property - def expert_global_to_physical(self) -> torch.Tensor | None: - """Routing table: global expert ID to physical expert ID.""" - tables = self.expert_map_manager.routing_tables - return tables[0] if tables else None + # Determine expert placement strategy before creating manager + expert_placement_strategy_effective = determine_expert_placement_strategy( + expert_placement_strategy=expert_placement_strategy, + moe_parallel_config=self.moe_parallel_config, + num_expert_group=num_expert_group, + num_redundant_experts=num_redundant_experts, + enable_eplb=eplb_manager is not None, + ) + else: + expert_placement_strategy_effective = expert_placement_strategy - @property - def expert_physical_to_global(self) -> torch.Tensor | None: - """Routing table: physical expert ID to global expert ID.""" - tables = self.expert_map_manager.routing_tables - return tables[1] if tables else None + # Create expert map manager + self.expert_map_manager = ExpertMapManager( + global_num_experts=self.global_num_experts, + logical_num_experts=self.logical_num_experts, + moe_parallel_config=self.moe_parallel_config, + placement_strategy=expert_placement_strategy_effective, + num_fused_shared_experts=self.num_fused_shared_experts, + rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, + device=vllm_config.device_config.device, + ) - @property - def expert_local_to_global(self) -> torch.Tensor | None: - """Routing table: local expert ID to global expert ID.""" - tables = self.expert_map_manager.routing_tables - return tables[2] if tables else None + # Register buffers for state_dict compatibility + # if self.expert_map_manager.expert_map is not None: + # self.register_buffer("_expert_map", 
self.expert_map_manager.expert_map) + # + # if self.expert_map_manager.expert_mask is not None: + # self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - @property - def is_internal_router(self) -> bool: - # By default, router/gate is called before FusedMoE forward pass - return self._runner.is_internal_router + # Log EP configuration (move into EMM?) + if self.use_ep: + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Expert " + "placement strategy: %s. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", + self.ep_rank, + self.ep_size, + self.expert_map_manager.placement_strategy, + self.expert_map_manager.local_num_experts, + self.expert_map_manager.global_num_experts, + self.expert_map_manager.get_compressed_map_string(), + ) - @property - def is_monolithic(self) -> bool: - return self._runner.quant_method.is_monolithic + self.top_k = top_k - @property - def shared_experts(self) -> SharedExperts | None: - return self._runner.shared_experts + # move into EMM? + self._init_aiter_shared_experts_topK_buffer( + vllm_config=vllm_config, dp_size=dp_size_ + ) - # - # Expert maps - # + # XXXXX move into EMM (this is just an assert) + if self.use_ep and self.rocm_aiter_fmoe_enabled: + expert_mask = self.expert_map_manager.expert_mask + assert expert_mask is None or torch.all( + (expert_mask == 0) | (expert_mask == 1) + ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." 
- @property - def expert_map(self) -> torch.Tensor | None: - return ( - self.expert_map_manager.expert_map - if not self.rocm_aiter_fmoe_enabled - else self.expert_map_manager.expert_mask - ) + assert intermediate_size % self.tp_size == 0 + self.intermediate_size_per_partition = intermediate_size // self.tp_size + self.renormalize = renormalize - def _maybe_init_expert_routing_tables( - self, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - """Get routing tables (already initialized by manager).""" - # Return routing tables from manager - routing_tables = self.expert_map_manager.routing_tables + # TODO(bnell): these attributes are only used by monolithic kernels. + # Put them in a MoERouterConfig dataclass? + self.use_grouped_topk = use_grouped_topk + if self.use_grouped_topk: + assert num_expert_group is not None and topk_group is not None + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.custom_routing_function = custom_routing_function + self.scoring_func = scoring_func + self.routed_scaling_factor = routed_scaling_factor + self.e_score_correction_bias = e_score_correction_bias + # TODO(bnell): end attributes - if routing_tables is None: - return None + # Store in runner? + self.apply_router_weight_on_input = apply_router_weight_on_input + self.activation = MoEActivation.from_str(activation) - # Register buffers for backward compatibility if not already registered - if not hasattr(self, "expert_global_to_physical"): - global_to_physical, physical_to_global, local_global = routing_tables - self.register_buffer("expert_global_to_physical", global_to_physical) - self.register_buffer("expert_physical_to_global", physical_to_global) - self.register_buffer("expert_local_to_global", local_global) + self._runner: MoERunner - return routing_tables + # TODO(bnell): we should not have to create a router if the kernel is + # monolithic. 
+ router = create_fused_moe_router( + top_k=top_k, + global_num_experts=self.global_num_experts, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor + if not apply_scale_to_output + else 1.0, + e_score_correction_bias=e_score_correction_bias, + num_fused_shared_experts=self.num_fused_shared_experts, + eplb_manager=eplb_manager, + # TODO(bnell): once we can construct the MK at init time, we + # can make this a value. + indices_type_getter=lambda: self._runner.quant_method.topk_indices_dtype, + zero_expert_type=zero_expert_type, + num_logical_experts=self.logical_num_experts, + ) + self.routing_method_type: RoutingMethodType = router.routing_method_type - def update_expert_map(self): - """Update expert mappings for new EP configuration.""" - # ep_size and ep_rank should already be updated in moe_parallel_config - self.expert_map_manager.update() + # TODO(bnell): is this redundant now? + # When using zero experts, slice e_score_correction_bias to cover + # only real experts, for compatibility with monolithic kernels that + # read it directly. + if zero_expert_type is not None and e_score_correction_bias is not None: + self.e_score_correction_bias = e_score_correction_bias[ + : self.logical_num_experts + ] - # Re-register buffers for state_dict compatibility - self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + # Round up hidden size before creating moe_config. + # This way moe_config is created with the correct hidden_size from the start. 
+ unpadded_hidden_size = hidden_size + self.model_type = ( + vllm_config.model_config.hf_config.model_type + if vllm_config.model_config is not None + else None + ) + hidden_size = maybe_roundup_hidden_size( + hidden_size=hidden_size, + act_dtype=moe_in_dtype, + moe_parallel_config=self.moe_parallel_config, + is_lora_enabled=vllm_config.lora_config is not None, + model_type=self.model_type, + ) + self.hidden_size = hidden_size - # Update routing table buffers if needed - self._maybe_init_expert_routing_tables() + self.moe_config: FusedMoEConfig = FusedMoEConfig( + num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + intermediate_size_per_partition=self.intermediate_size_per_partition, + num_local_experts=self.local_num_experts, + num_logical_experts=self.logical_num_experts, + moe_parallel_config=self.moe_parallel_config, + in_dtype=moe_in_dtype, + moe_backend=vllm_config.kernel_config.moe_backend, + router_logits_dtype=router_logits_dtype, + max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, + has_bias=has_bias, + is_act_and_mul=is_act_and_mul, + is_lora_enabled=vllm_config.lora_config is not None, + activation=self.activation, + device=vllm_config.device_config.device, + routing_method=self.routing_method_type, + # TODO: in_dtype == out_dtype? + disable_inplace=disable_inplace() or shared_experts is not None, + ) - # Handle AITER shared experts if needed - if self.aiter_fmoe_shared_expert_enabled: - self._init_aiter_shared_experts_topK_buffer( - vllm_config=get_current_vllm_config(), - dp_size=get_dp_group().world_size, + # Move XXXXXXXXXXXXX + if self.moe_config.use_mori_kernels: + assert self.rocm_aiter_fmoe_enabled, ( + "Mori needs to be used with aiter fused_moe for now." + ) + assert not self.aiter_fmoe_shared_expert_enabled, ( + "Mori does not support fusion shared expert now. 
" + "Turn it off by setting VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0" ) - def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - """Map global expert ID to local expert ID.""" - return self.expert_map_manager.map_global_to_local(expert_id) + self.quant_config = quant_config - # - # EPLB - # + logger.debug("FusedMoEConfig = %s", self.moe_config) - def _init_aiter_shared_experts_topK_buffer( - self, vllm_config: VllmConfig, dp_size: int - ): - if self.num_fused_shared_experts > 0: - init_aiter_topK_meta_data( - n_routed_experts=self.global_num_experts, - n_shared_experts=self.num_fused_shared_experts, - top_k=self.top_k, - tp_rank=self.ep_rank if self.use_ep else self.tp_rank, - tp_size=self.ep_size if self.use_ep else self.tp_size, - shared_experts_score=1.0, - max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens - * dp_size, - is_EP=self.use_ep, - ) - # HACK - self.expert_map_manager._local_num_experts += self.num_fused_shared_experts + quant_method = self._get_quant_method( + prefix, + quant_config, + self.moe_config, + ) - def get_expert_weights(self) -> Iterable[torch.Tensor]: - """Delegate to EPLB manager.""" - if self._runner.router.eplb_manager is not None: - return self._runner.router.eplb_manager.get_expert_weights(self) - else: - return [] + # TODO(bnell): only for weight loading. how to get around this? + self.quant_method = quant_method - def set_eplb_state( - self, - moe_layer_idx: int, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - """ - Register the EPLB state in this layer. + # Move XXXXXXXXXXXXX + if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): + raise NotImplementedError( + "is_act_and_mul=False is supported only for CUDA and ROCm for now" + ) - This is used later in forward pass, where we get the expert mapping - and record the load metrics in `expert_load_view`. 
- """ - if self._runner.router.eplb_manager is not None: - self._runner.router.eplb_manager.set_state( - moe_layer_idx, - expert_load_view, - logical_to_physical_map, - logical_replica_count, + # Move XXXXXXXXXXXXX + if eplb_manager is not None and not quant_method.supports_eplb: + # TODO: Add support for additional quantization methods. + # The implementation for other quantization methods does not + # contain essential differences, but the current quant API + # design causes duplicated work when extending to new + # quantization methods, so I'm leaving it for now. + # If you plan to add support for more quantization methods, + # please refer to the implementation in `Fp8MoEMethod`. + raise NotImplementedError( + f"EPLB is not supported {quant_method.__class__.__name__}." ) - @classmethod - def make_expert_params_mapping( - cls, - model: torch.nn.Module, - ckpt_gate_proj_name: str, - ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int, - num_redundant_experts: int = 0, - ) -> list[tuple[str, str, int, str]]: - """Delegate to EPLB manager.""" - return EplbManager.make_expert_params_mapping( - model, - ckpt_gate_proj_name, - ckpt_down_proj_name, - ckpt_up_proj_name, - num_experts, - num_redundant_experts, + # Storing the runner in the FusedMoE is an intermediate state, eventually + # the runner will own the FusedMoE layer and provide the execution interface + # for MoE ops. 
+ self._runner = create_moe_runner( + layer=self, + moe_config=self.moe_config, + router=router, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + gate=gate, + shared_experts=shared_experts, + quant_method=quant_method, + enable_dbo=vllm_config.parallel_config.enable_dbo, + apply_scale_to_output=apply_scale_to_output, + routed_scaling_factor=routed_scaling_factor, ) - # - # Weight Loading - # - - def _load_per_tensor_weight_scale( - self, - shard_id: str, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - expert_id: int, - ): - param_data = param.data - # for per tensor weight quantization - if shard_id in ("w1", "w3"): - # We have to keep the weight scales of w1 and w3 because - # we need to re-quantize w1/w3 weights after weight loading. - idx = 0 if shard_id == "w1" else 1 - param_data[expert_id][idx] = loaded_weight - # If we are in the row parallel case (down_proj) - elif shard_id == "w2": - param_data[expert_id] = loaded_weight + # Create RoutedExperts instance BEFORE create_weights() + # This will hold all expert weight parameters + self.routed_experts = RoutedExperts( + self.layer_name, + params_dtype, + unpadded_hidden_size, + intermediate_size, + self.moe_config, + self.quant_config, + self.quant_method, + expert_map_manager=self.expert_map_manager, + # Extra params that are needed by quant_methods, pass along for now + rocm_aiter_fmoe_enabled=self.rocm_aiter_fmoe_enabled, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + apply_router_weight_on_input=apply_router_weight_on_input, + activation=MoEActivation.from_str(activation), + # XXXXXXXXXXXXXXXXXXXXXXXX + shared_experts=self._runner.shared_experts, + ) - def _load_combined_w13_weight_scale( - self, - shard_dim: int, 
- loaded_weight: torch.Tensor, - param: torch.Tensor, - tp_rank: int, - ): - """ - Load w13 weight scales assuming that w1 weight scales and w3 weight - scales are stored in the same loaded_weight tensor. - """ - shard_size = param.shape[shard_dim] - loaded_weight = loaded_weight.narrow( - shard_dim, shard_size * tp_rank, shard_size + def extra_repr(self) -> str: + s = ( + f"global_num_experts={self.global_num_experts}, " + f"local_num_experts={self.local_num_experts}, " + f"top_k={self.top_k}, " + f"intermediate_size_per_partition={self.intermediate_size_per_partition}, " # noqa: E501 + f"tp_size={self.tp_size},\n" + f"ep_size={self.ep_size}, " ) - param.copy_(loaded_weight) - def _load_model_weight_or_group_weight_scale( + return s + + def _get_quant_method( self, - shard_dim: int, - expert_data: torch.Tensor, - shard_id: str, - loaded_weight: torch.Tensor, - tp_rank: int, - load_full_w2: bool = False, - ): + prefix: str, + quant_config: QuantizationConfig | None, + moe_config: FusedMoEConfig, + ) -> FusedMoEMethodBase: """ - Load grouped weight scales for group quantization or model weights - :param shard_dim: dimension to shard - :param expert_data: parameter for a particular expert - :param shard_id: either w1, w2, or w3 - :param loaded_weight: checkpoint weight to load into the param - :param tp_rank: tensor parallel rank - :param load_full_w2: whether or not the w2 loaded should be sharded. + Helper method to ensure quant_method is never None and + of the proper type. 
""" - if shard_id == "w2": - # In the case where we have actorder/g_idx, we do not partition the - # w2 scales, as indicated by `load_full` argument, for all tp cases - self._load_w2( - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=tp_rank, - load_full=load_full_w2, - ) - elif shard_id in ("w1", "w3"): - self._load_w13( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=tp_rank, - ) + quant_method = None + if quant_config is not None: + quant_method = quant_config.get_quant_method(self, prefix) + if quant_method is None: + quant_method = UnquantizedFusedMoEMethod(moe_config) + assert isinstance(quant_method, FusedMoEMethodBase) + return quant_method - def _load_per_channel_weight_scale( - self, - expert_data: torch.Tensor, - shard_dim: int, - shard_id: str, - loaded_weight: torch.Tensor, - tp_rank: int, - ): - # for per channel weight quantization - if shard_id == "w2": - expert_data.copy_(loaded_weight) - elif shard_id in ("w1", "w3"): - self._load_w13( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=tp_rank, - ) + # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py + # and vllm/distributed/elastic_ep/elastic_execute.py + # can safely swap out the quant_method. We should figure out a less + # intrusive way to do this. + def _replace_quant_method(self, mk: FusedMoEMethodBase): + self._runner._replace_quant_method(mk) + self.routed_experts.quant_method = mk - def _load_w13( - self, - expert_data: torch.Tensor, - shard_dim: int, - shard_id: str, - loaded_weight: torch.Tensor, - tp_rank: int, - load_full: bool = False, - ): - # Index the loaded weight for tp sharding. 
- # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim - if self.moe_config.is_act_and_mul: - shard_size = expert_data.shape[shard_dim] // 2 - else: - shard_size = expert_data.shape[shard_dim] - # Only narrow if the loaded_weight is not a scalar (0-dim tensor) - # and we're not loading the full weight - if not load_full and loaded_weight.ndim > 0: - loaded_weight = loaded_weight.narrow( - shard_dim, shard_size * tp_rank, shard_size - ) - # Narrow parameter and load. - # w1, gate_proj: Load into first logical weight of w13. - if shard_id == "w1": - expert_data = expert_data.narrow(shard_dim, 0, shard_size) - # w3, up_proj: Load into second logical weight of w13. - else: - assert shard_id == "w3" - expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) - expert_data.copy_(loaded_weight) + # def _ensure_moe_quant_config_init(self): + # if self._runner.quant_method.moe_quant_config is None: + # # Note: the moe_quant_config can't be constructed until after + # # weight loading post processing. + # self._runner.quant_method.moe_quant_config = ( + # self._runner.quant_method.get_fused_moe_quant_config(self) + # ) - def _load_w2( - self, - expert_data: torch.Tensor, - shard_dim: int, - loaded_weight: torch.Tensor, - tp_rank: int, - load_full: bool = False, - ): - # Index the loaded weight for tp sharding. - # down_proj: "RowParallel" so tp sharding on input_dim - # Narrow parameter and load. - shard_size = expert_data.shape[shard_dim] - # Only narrow if the loaded_weight is not a scalar (0-dim tensor) - # and we're not loading the full weight - if not load_full and loaded_weight.ndim > 0: - loaded_weight = loaded_weight.narrow( - shard_dim, shard_size * tp_rank, shard_size - ) - # w2, down_proj: Load into only logical weight of w2. - expert_data.copy_(loaded_weight) + # Note: maybe_init_modular_kernel should only be called by + # prepare_communication_buffer_for_model. 
+ # This is called after all weight loading and post-processing, so it + # should be safe to swap out the quant_method. + def maybe_init_modular_kernel(self) -> None: + # NOTE(rob): WIP refactor. For quant methods that own the MK + # we create the MK during process_weights_after_loading. + if ( + self._runner.quant_method.supports_internal_mk + or self._runner.quant_method.is_monolithic + ): + return None - def _load_single_value( - self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int - ): - param_data = param.data + self.routed_experts._ensure_moe_quant_config_init() + # routing_tables only needed for round-robin expert placement with + # DeepEP all2all backend. + routing_tables = self._maybe_init_expert_routing_tables() - # Input scales can be loaded directly and should be equal. - param_data[expert_id] = loaded_weight + if isinstance(self._runner.quant_method, FusedMoEModularMethod): + base_quant_method = self._runner.quant_method.old_quant_method + else: + base_quant_method = self._runner.quant_method - def _load_g_idx( - self, - shard_id: str, - expert_data: torch.Tensor, - shard_dim: int, - loaded_weight: torch.Tensor, - tp_rank: int, - ): - if shard_id == "w2": - self._load_w2( - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=tp_rank, + prepare_finalize = base_quant_method.maybe_make_prepare_finalize( + routing_tables=routing_tables + ) + if prepare_finalize is not None: + logger.debug( + "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) + ) + self._replace_quant_method( + FusedMoEModularMethod.make( + self, + base_quant_method, + prepare_finalize, + self.shared_experts, + inplace=not self.moe_config.disable_inplace, + ) ) - else: - assert shard_id in ("w1", "w3") - expert_data.copy_(loaded_weight) - @overload - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: 
Literal[False], - ) -> None: ... + # + # Properties + # - @overload - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: Literal[True], - ) -> bool: ... + @property + def layer_id(self): + # Delayed import to avoid circular dependency + from vllm.model_executor.models.utils import extract_layer_index - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: bool = False, - ) -> bool | None: - if self.quant_config and self.quant_config.get_name() == "mxfp4": - # (FIXME) for gpt-oss all experts are combined - if "bias" in weight_name: - dim1 = loaded_weight.shape[1] - param.data[:, :dim1].copy_(loaded_weight) - else: - dim1 = loaded_weight.shape[1] - dim2 = loaded_weight.shape[2] - param.data[:, :dim1, :dim2].copy_(loaded_weight) - return True if return_success else None + return extract_layer_index(self.layer_name) - quant_method_name = self._runner.quant_method.__class__.__name__ - global_expert_id = expert_id - expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id) + @property + def tp_size(self): + return self.moe_parallel_config.tp_size - use_global_sf = ( - getattr(self._runner.quant_method, "use_global_sf", False) - and "input_scale" in weight_name - ) + @property + def ep_size(self): + return self.moe_parallel_config.ep_size - if expert_id == -1 and not use_global_sf: - # Failed to load this param since it's not local to this rank - return False if return_success else None - # Hereafter, `expert_id` is local physical id + @property + def tp_rank(self): + return self.moe_parallel_config.tp_rank - # is_transposed: if the dim to shard the weight - # should be flipped. 
Required by GPTQ, compressed-tensors - # should be whatever dimension intermediate_size_per_partition is - is_transposed = getattr(param, "is_transposed", False) + @property + def ep_rank(self): + return self.moe_parallel_config.ep_rank - # compressed-tensors checkpoints with packed weights are stored flipped - # TODO (mgoin): check self._runner.quant_method.quant_config.quant_format - # against known CompressionFormat enum values that have this quality - if quant_method_name in ( - "CompressedTensorsWNA16MarlinMoEMethod", - "CompressedTensorsWNA16MoEMethod", - ): - if is_transposed: - loaded_weight = loaded_weight.t().contiguous() - else: - loaded_weight = loaded_weight + @property + def use_ep(self): + return self.moe_parallel_config.use_ep - if shard_id not in ("w1", "w2", "w3"): - raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.") + # XXXXXXXXX keep this separate + @property + def local_num_experts(self) -> int: + """Number of experts assigned to this rank.""" + return self.expert_map_manager.local_num_experts - # Fetch the dim to shard the parameter/loaded weight - # based on the shard id. This will be whatever - # dimension intermediate_size_per_partition is used. 
- SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} + @property + def expert_placement_strategy(self) -> ExpertPlacementStrategy: + """Expert placement strategy ('linear' or 'round_robin').""" + return self.expert_map_manager.placement_strategy - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type: - param.weight_type = loaded_weight.item() - param.data.copy_(loaded_weight) - return True if return_success else None + @property + def expert_global_to_physical(self) -> torch.Tensor | None: + """Routing table: global expert ID to physical expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[0] if tables else None - # Case for BitsAndBytes - use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - if use_bitsandbytes_4bit: - shard_dim = 0 + @property + def expert_physical_to_global(self) -> torch.Tensor | None: + """Routing table: physical expert ID to global expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[1] if tables else None - expert_data = param.data[expert_id] - if shard_id == "w2": - expert_data.copy_(loaded_weight) - elif shard_id in ("w1", "w3"): - # BNB inflight quantization has already sharded the weights - full_load = True - self._load_w13( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.tp_rank, - load_full=full_load, - ) - return True if return_success else None + @property + def expert_local_to_global(self) -> torch.Tensor | None: + """Routing table: local expert ID to global expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[2] if tables else None - shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] - if is_transposed: - shard_dim = int(not shard_dim) + @property + def is_internal_router(self) -> bool: + # By default, router/gate is called before FusedMoE forward pass + return 
self._runner.is_internal_router - full_load = len(loaded_weight.shape) == 3 - if full_load: - shard_dim += 1 + @property + def is_monolithic(self) -> bool: + return self._runner.quant_method.is_monolithic - # Materialize GGUF UninitializedParameter accounting merged weights - if is_gguf_weight and isinstance(param, UninitializedParameter): - # To materialize a tensor, we must have full shape including - # number of experts, making this portion to require `full_load`. - assert full_load - final_shape = list(loaded_weight.shape) - # w1 and w3 are merged per expert. - if shard_id in {"w1", "w3"}: - final_shape[1] *= 2 - final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size - param.materialize(final_shape, dtype=loaded_weight.dtype) + @property + def shared_experts(self) -> SharedExperts | None: + return self._runner.shared_experts - expert_data = param.data if full_load else param.data[expert_id] + # + # Expert maps + # - # Case input scale: input_scale loading is only supported for fp8 - if "input_scale" in weight_name: - # this is needed for compressed-tensors only - loaded_weight = loaded_weight.to(param.data.device) + @property + def expert_map(self) -> torch.Tensor | None: + return self.routed_experts.expert_map - if ( - "compressed" in quant_method_name.lower() - and param.data[expert_id] != 1 - and (param.data[expert_id] - loaded_weight).abs() > 1e-5 - ): - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param.data[expert_id]} " - f"vs. 
{loaded_weight}" - ) + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + return self.routed_experts._maybe_init_expert_routing_tables() - self._load_single_value( - param=param, - loaded_weight=loaded_weight, - expert_id=global_expert_id if use_global_sf else expert_id, + def update_expert_map(self): + self.routed_experts.update_expert_map() + + def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: + """Map global expert ID to local expert ID.""" + return self.routed_experts._map_global_expert_id_to_local_expert_id(expert_id) + + # + # EPLB + # + + def _init_aiter_shared_experts_topK_buffer( + self, vllm_config: VllmConfig, dp_size: int + ): + if self.num_fused_shared_experts > 0: + init_aiter_topK_meta_data( + n_routed_experts=self.global_num_experts, + n_shared_experts=self.num_fused_shared_experts, + top_k=self.top_k, + tp_rank=self.ep_rank if self.use_ep else self.tp_rank, + tp_size=self.ep_size if self.use_ep else self.tp_size, + shared_experts_score=1.0, + max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens + * dp_size, + is_EP=self.use_ep, ) - return True if return_success else None + # HACK + self.expert_map_manager._local_num_experts += self.num_fused_shared_experts - # Case g_idx - if "g_idx" in weight_name: - self._load_g_idx( - shard_dim=0, - shard_id=shard_id, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.tp_rank, - ) - return True if return_success else None + def get_expert_weights(self) -> Iterable[torch.Tensor]: + """Delegate to EPLB manager.""" + if self._runner.router.eplb_manager is not None: + return self._runner.router.eplb_manager.get_expert_weights(self) + else: + return [] - # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern - if "ModelOpt" in quant_method_name: - # Determine per-tensor weight scale patterns based on variant - # Use the dedicated method instead of brittle string matching - 
uses_weight_scale_2 = ( - self._runner.quant_method.uses_weight_scale_2_pattern() - ) - quant_method = getattr(param, "quant_method", None) + def set_eplb_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + """ + Register the EPLB state in this layer. - # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) - # weights scales. - # Input scales are always per-tensor. - # Weight scales: FP4 uses "weight_scale_2" and FP8 uses - # "weight_scale" for per-tensor scales. - # NOTE: ModelOpt MXFP8 MoE uses block scales in weight_scale - # tensors (quant_method=BLOCK), so those must not be treated - # as per-tensor scalars here. - is_block_weight_scale = ( - "weight_scale" in weight_name - and quant_method == FusedMoeWeightScaleSupported.BLOCK.value + This is used later in forward pass, where we get the expert mapping + and record the load metrics in `expert_load_view`. + """ + if self._runner.router.eplb_manager is not None: + self._runner.router.eplb_manager.set_state( + moe_layer_idx, + expert_load_view, + logical_to_physical_map, + logical_replica_count, ) - is_per_tensor = ( - "weight_scale_2" in weight_name - if uses_weight_scale_2 - else "weight_scale" in weight_name - ) or "input_scale" in weight_name - is_per_tensor = is_per_tensor and not is_block_weight_scale - if is_per_tensor: - self._load_per_tensor_weight_scale( - shard_id=shard_id, - param=param, - loaded_weight=loaded_weight, - expert_id=expert_id, - ) - return True if return_success else None - - # If the weight is w13_weight_scale and w13_weight_scales are - # combined into single loaded_weight, call - # _load_combined_w13_weight_scale() to load it. - # This is checked by comparing the hidden_out dims of the - # loaded_weight and the param. 
- if "w13_weight_scale" in weight_name: - loaded_weight_hidden_out = loaded_weight.shape[-2] - param_hidden_out = param.data.shape[-2] * self.tp_size - if loaded_weight_hidden_out == param_hidden_out: - self._load_combined_w13_weight_scale( - shard_dim=shard_dim, - loaded_weight=loaded_weight, - param=expert_data, - tp_rank=self.tp_rank, - ) - return True if return_success else None - # For other weights, call _load_model_weight_or_group_weight_scale() - # to load it. - if "weight" in weight_name: - self._load_model_weight_or_group_weight_scale( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.tp_rank, - ) - return True if return_success else None + @classmethod + def make_expert_params_mapping( + cls, + model: torch.nn.Module, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + num_redundant_experts: int = 0, + ) -> list[tuple[str, str, int, str]]: + """Delegate to EPLB manager.""" + return EplbManager.make_expert_params_mapping( + model, + ckpt_gate_proj_name, + ckpt_down_proj_name, + ckpt_up_proj_name, + num_experts, + num_redundant_experts, + ) - # Case weight scales, zero_points and offset, weight/input global scales - if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name: - # load the weight scales and zp based on the quantization scheme - # supported weight scales/zp can be found in - # FusedMoeWeightScaleSupported - # TODO @dsikka: once hardened, refactor to use vLLM Parameters - # specific to each case - quant_method = getattr(param, "quant_method", None) - if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value: - self._load_per_channel_weight_scale( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.tp_rank, - ) - elif quant_method in [ - FusedMoeWeightScaleSupported.GROUP.value, - FusedMoeWeightScaleSupported.BLOCK.value, - ]: - 
self._load_model_weight_or_group_weight_scale( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.tp_rank, - load_full_w2=getattr(param, "load_full_w2", False), - ) - elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: - self._load_per_tensor_weight_scale( - shard_id=shard_id, - param=param, - loaded_weight=loaded_weight, - expert_id=expert_id, - ) - else: - WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported] - raise ValueError( - f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}" - ) - return True if return_success else None + # + # Weight Loading (Delegated to RoutedExperts) + # - # Case weight_shape - if "weight_shape" in weight_name: - # only required by compressed-tensors - self._load_single_value( - param=param, loaded_weight=loaded_weight, expert_id=expert_id - ) - return True if return_success else None + @overload + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: Literal[False], + ) -> None: ... - # Case model weights - if "weight" in weight_name: - self._load_model_weight_or_group_weight_scale( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.tp_rank, - ) - return True if return_success else None + @overload + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: Literal[True], + ) -> bool: ... 
- return False if return_success else None + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: bool = False, + ) -> bool | None: + """Delegate to RoutedExperts.""" + return self.routed_experts.weight_loader( + param=param, + loaded_weight=loaded_weight, + weight_name=weight_name, + shard_id=shard_id, + expert_id=expert_id, + return_success=return_success, + ) def load_weights( self, weights: Iterable[tuple[str, torch.Tensor]] ) -> Iterable[str]: - if (expert_mapping := self.expert_mapping) is None: - raise ValueError( - "`self.expert_mapping` must be provided to " - "load weights using `self.load_weights`." - ) - for expert_name, loaded_weight in weights: - qual_name = f"{self.layer_name}.{expert_name}" - for param_name, weight_name, expert_id, shard_id in expert_mapping: - if weight_name not in qual_name: - continue - weight_name = qual_name.replace(weight_name, param_name) - param_name = weight_name.removeprefix(f"{self.layer_name}.") - param = getattr(self, param_name) - # Fused expert weights can be identified by their 3D tensors - if loaded_weight.dim() == 3: - # Repurpose expert_id as shard_idx for deconcatenating w1 and w3 - if shard_id in {"w1", "w3"}: - shard_idx = expert_id - experts_shard = loaded_weight.chunk(2, dim=1)[shard_idx] - else: - experts_shard = loaded_weight - start = 0 - else: - # loaded_weight is a single expert weight, so we add a dummy expert - # dimension to unify the loading logic with the fused case - experts_shard = loaded_weight.unsqueeze(0) - start = expert_id - - # Unified loading logic for fused and non-fused experts - loaded_experts = experts_shard.unbind() - for expert_id, loaded_expert in enumerate(loaded_experts, start=start): - success = self.weight_loader( - param=param, - loaded_weight=loaded_expert, - weight_name=weight_name, - shard_id=shard_id, - expert_id=expert_id, - return_success=True, - ) - if success: - 
logger.debug( - "Loaded expert %d of shard %s into %s for layer %s", - expert_id, - shard_id, - param_name, - self.layer_name, - ) - yield param_name + """Delegate to RoutedExperts.""" + return self.routed_experts.load_weights(weights) # # Execution diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index c728a81375a6..dd2adcb6d5b5 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -88,7 +88,7 @@ def _moe_forward( ) -> torch.Tensor: layer = get_layer_from_name(_resolve_layer_name(layer_name)) return layer._runner._forward_dispatch( - layer, + layer.routed_experts, hidden_states, router_logits, shared_experts_input, @@ -112,7 +112,7 @@ def _moe_forward_shared( ) -> tuple[torch.Tensor, torch.Tensor]: layer = get_layer_from_name(_resolve_layer_name(layer_name)) return layer._runner._forward_dispatch( - layer, + layer.routed_experts, hidden_states, router_logits, shared_experts_input, @@ -464,19 +464,19 @@ def _apply_quant_method( ) if self.quant_method.is_monolithic: - fused_out = self.quant_method.apply_monolithic( - layer=layer, + # Monolithic kernels: pass router_logits to routed_experts + fused_out = layer.forward( x=hidden_states, router_logits=router_logits, ) else: + # Modular kernels: select experts first, then call routed_experts topk_weights, topk_ids = self.router.select_experts( hidden_states=hidden_states, router_logits=router_logits, ) - fused_out = self.quant_method.apply( - layer=layer, + fused_out = layer.forward( x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index e2a5d05320d0..1b3e40cc6efb 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ 
b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -88,7 +88,7 @@ def _select_monolithic(self) -> Callable: def forward_native( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -292,7 +292,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def apply( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -317,7 +317,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def forward_cuda( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -340,7 +340,7 @@ def forward_cuda( def forward_monolithic_cuda( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -366,7 +366,7 @@ def forward_monolithic_cuda( def forward_monolithic_cpu( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3cf3116f0670..3244aa8c10ad 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -8,7 +8,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.layer 
import FusedMoE, RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index bcb3a43effee..c7a1731fdd4f 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -23,6 +23,7 @@ FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + RoutedExperts, UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.linear import ( @@ -800,7 +801,7 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 729924663646..9deaf7cc08e3 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, + RoutedExperts, ) from vllm.model_executor.layers.linear import ( LinearBase, @@ -478,7 +479,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 4fcc468c6cfb..b4529d9485d1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -24,7 +24,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe 
import FusedMoE, RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index d337eb37c32e..3bc4c3b85a52 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -17,11 +17,11 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEActivationFormat, FusedMoEExpertsModular, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + RoutedExperts, UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.activation import MoEActivation @@ -123,7 +123,7 @@ def get_moe_method( layer: torch.nn.Module, layer_name: str, ) -> FusedMoEMethodBase: - # FusedMoE was made by combining multiple Linears so need to + # RoutedExperts was made by combining multiple Linears so need to # make sure quantization config for Linear can target it quant_config._add_fused_moe_to_target_scheme_map() unfused_names = [ @@ -318,7 +318,7 @@ def get_fused_moe_quant_config( w2_scale=layer.w2_weight_scale, ) - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: layer.w13_weight = torch.nn.Parameter( layer.w13_weight_packed.data, requires_grad=False ) @@ -350,7 +350,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -505,7 +505,7 @@ def create_weights( ) set_weight_attrs(w2_input_scale, extra_weight_attrs) - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def 
process_weights_after_loading(self, layer: RoutedExperts) -> None: """ Convert NVFP4 MoE weights into kernel format and setup the kernel. """ @@ -600,7 +600,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -623,7 +623,7 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -789,7 +789,7 @@ def create_weights( torch.ones(num_experts, dtype=torch.float32), requires_grad=False ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-TENSOR quantization for FusedMoE.weight_loader. + # Add PER-TENSOR quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} ) @@ -812,7 +812,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + # Add PER-CHANNEL quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} ) @@ -841,7 +841,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + # Add PER-CHANNEL quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value} ) @@ -865,7 +865,7 @@ def create_weights( layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: # Allow for accessing weights and scales in standard way. 
w13 = layer.w13_weight w2 = layer.w2_weight @@ -962,7 +962,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -984,7 +984,7 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -1096,7 +1096,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + # Add PER-CHANNEL quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} ) @@ -1124,7 +1124,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: torch.nn.Module, # RoutedExperts x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -1610,7 +1610,7 @@ def is_monolithic(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -1635,7 +1635,7 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -1884,7 +1884,7 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -2186,7 +2186,7 @@ def is_monolithic(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -2338,7 +2338,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-GROUP quantization for FusedMoE.weight_loader. + # Add PER-GROUP quantization for RoutedExperts.weight_loader. 
extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.GROUP.value} ) @@ -2501,7 +2501,7 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 301441ff019d..1c11249b0e4b 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -10,6 +10,7 @@ FusedMoE, FusedMoEConfig, FusedMoEMethodBase, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -136,7 +137,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ceaad92e9cb1..fac71b956b69 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -24,6 +24,7 @@ FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -804,7 +805,7 @@ def create_weights( def _setup_kernel( self, - layer: FusedMoE, + layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -937,7 +938,7 @@ def supports_eplb(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: torch.nn.Module, # RoutedExperts x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -960,7 +961,7 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: torch.nn.Module, # RoutedExperts x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/gguf.py 
b/vllm/model_executor/layers/quantization/gguf.py index cf72c83d717a..141f85f81779 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, + RoutedExperts, ) from vllm.model_executor.layers.linear import ( LinearBase, @@ -632,7 +633,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 154347a930a9..bf194c29c1d0 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -12,7 +12,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, RoutedExperts from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 0b43fbd392c9..528f58fd4f75 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -23,6 +23,7 @@ FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + RoutedExperts, UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs @@ -896,7 +897,7 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 
a5e1ea52f557..23543af51db8 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoeWeightScaleSupported, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( Fp8MoeBackend, @@ -844,7 +845,7 @@ def create_weights( def _setup_kernel( self, - layer: FusedMoE, + layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -929,7 +930,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -952,7 +953,7 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -1337,7 +1338,7 @@ def create_weights( ) layer.register_parameter("w2_input_scale", w2_input_scale) - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: """ Convert NVFP4 MoE weights into kernel format and setup the kernel. 
""" @@ -1413,7 +1414,7 @@ def supports_eplb(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -1436,7 +1437,7 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -1946,7 +1947,7 @@ def is_monolithic(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: @@ -2029,7 +2030,7 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index a327ac17bbc9..acd597c72fae 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -17,6 +17,7 @@ FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, @@ -364,7 +365,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 57a9a5e14c02..9e7c22f57f41 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -10,6 +10,7 @@ FusedMoE, FusedMoEConfig, FusedMoEMethodBase, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe import modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import ( @@ -233,7 +234,7 @@ def create_weights( def _setup_kernel( self, - layer: FusedMoE, + 
layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -376,7 +377,7 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -399,7 +400,7 @@ def apply( def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/mxfp8.py b/vllm/model_executor/layers/quantization/mxfp8.py index 5b4564bea31c..9156e5539228 100644 --- a/vllm/model_executor/layers/quantization/mxfp8.py +++ b/vllm/model_executor/layers/quantization/mxfp8.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEMethodBase, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe.oracle.mxfp8 import ( diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 78c64bac6187..a06ec65e8115 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -8,7 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 7ce92523d0ee..b1c29cccad93 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -11,11 +11,11 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from 
vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, MoEActivation, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -224,7 +224,7 @@ def create_weights( torch.ones(num_experts, dtype=torch.float32), requires_grad=False ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-TENSOR quantization for FusedMoE.weight_loader. + # Add PER-TENSOR quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} ) @@ -246,7 +246,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + # Add PER-CHANNEL quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} ) @@ -441,7 +441,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -630,7 +630,7 @@ def get_fused_moe_quant_config(self, layer): def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -1038,7 +1038,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 66300ceaefab..3d7d94e68b6b 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -20,7 +20,7 @@ ) if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.layer import FusedMoE + from 
vllm.model_executor.layers.fused_moe.layer import RoutedExperts from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import ( NvFp4MoeBackend, ) @@ -191,7 +191,7 @@ def prepare_static_weights_for_trtllm_fp4_moe( def prepare_nvfp4_moe_layer_for_fi_or_cutlass( backend: "NvFp4MoeBackend", - layer: "FusedMoE", + layer: "RoutedExperts", w13: torch.Tensor, w13_scale: torch.Tensor, w13_scale_2: torch.Tensor, diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 1cafccd49670..babee6e081e8 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -99,12 +99,13 @@ def _extract_data_from_linear_base_module( def _extract_data_from_fused_moe_module( - m: torch.nn.Module, + m_: torch.nn.Module, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int]: """ Extract weights, weight scales and num_topk from FusedMoE module. """ - assert isinstance(m, FusedMoE) + assert isinstance(m_, FusedMoE) + m = m_.routed_experts w13 = m.w13_weight w13_s = ( m.w13_weight_scale_inv diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c5f674d8ccc4..0bc19c0bba2d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -6621,13 +6621,12 @@ def init_routed_experts_capturer(self): self.routed_experts_initialized = True def _bind_routed_experts_capturer(self, capturer: RoutedExpertsCapturer) -> None: - from vllm.model_executor.layers.fused_moe.layer import FusedMoE - from vllm.model_executor.layers.fused_moe.router.base_router import ( - BaseRouter, - ) + from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoERouter for module in self.compilation_config.static_forward_context.values(): - if isinstance(module, FusedMoE) and isinstance(module.router, BaseRouter): + if isinstance(module, FusedMoE) and isinstance( + module.router, FusedMoERouter + ): layer_id = module.layer_id def 
_capture_fn(topk_ids, _layer_id=layer_id, _capturer=capturer): From 8dba22b79b082701c3a21de63b75c7c75048a64e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 26 Mar 2026 21:18:40 +0000 Subject: [PATCH 089/191] move RoutedExperts to separate file Signed-off-by: Bill Nell --- .../layers/fused_moe/__init__.py | 6 +- vllm/model_executor/layers/fused_moe/layer.py | 716 +---------------- .../layers/fused_moe/routed_experts.py | 742 ++++++++++++++++++ .../model_executor/layers/quantization/awq.py | 2 +- .../layers/quantization/awq_marlin.py | 12 +- .../compressed_tensors/compressed_tensors.py | 2 +- .../layers/quantization/gptq.py | 2 +- .../layers/quantization/gptq_marlin.py | 12 +- .../layers/quantization/modelopt.py | 10 +- .../layers/quantization/moe_wna16.py | 14 +- .../layers/quantization/mxfp8.py | 1 - .../layers/quantization/quark/quark.py | 2 +- 12 files changed, 775 insertions(+), 746 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/routed_experts.py diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 28bd2caf0a99..f01d4932be4f 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -18,14 +18,16 @@ ) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, - FusedMoeWeightScaleSupported, - RoutedExperts, ) from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEActivationFormat, FusedMoEExpertsModular, FusedMoEPrepareAndFinalizeModular, ) +from vllm.model_executor.layers.fused_moe.routed_experts import ( + FusedMoeWeightScaleSupported, + RoutedExperts, +) from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a3145682daad..c567a1237b98 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ 
b/vllm/model_executor/layers/fused_moe/layer.py @@ -2,11 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable, Iterable -from enum import Enum from typing import Literal, overload import torch -from torch.nn.parameter import UninitializedParameter import vllm.envs as envs from vllm._aiter_ops import rocm_aiter_ops @@ -38,6 +36,7 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) @@ -64,13 +63,6 @@ logger = init_logger(__name__) -class FusedMoeWeightScaleSupported(Enum): - TENSOR = "tensor" - CHANNEL = "channel" - GROUP = "group" - BLOCK = "block" - - # Should be method? only used in layer def determine_expert_placement_strategy( expert_placement_strategy: ExpertPlacementStrategy, @@ -145,712 +137,6 @@ def maybe_roundup_hidden_size( return hidden_size -class RoutedExperts(torch.nn.Module): - """ - Container for routed expert weights and execution logic. - - This module owns the expert weight parameters (w13_weight, w2_weight, scales, etc.) - and handles: - - Loading checkpoint weights into parameters - - Executing routed experts via quant_method.apply() - - Weight parameters are registered on this module via _ParameterRegistrationWrapper - during FusedMoE initialization. - """ - - def __init__( - self, - layer_name: str, - params_dtype: torch.dtype, - unpadded_hidden_size: int, # put in moe_config? 
- intermediate_size: int, - moe_config: FusedMoEConfig, - quant_config: QuantizationConfig | None, - quant_method: FusedMoEMethodBase, - expert_map_manager: ExpertMapManager, - **kwargs, - ): - super().__init__() - self.layer_name = layer_name - self.moe_config = moe_config - self.quant_config = quant_config - self.quant_method = quant_method - self.expert_map_manager = expert_map_manager - self.hidden_size = moe_config.hidden_dim - self.intermediate_size_per_partition = ( - moe_config.intermediate_size_per_partition - ) - self.global_num_experts = moe_config.num_experts - self.local_num_experts = moe_config.num_local_experts - - # Register buffers for state_dict compatibility - if self.expert_map_manager.expert_map is not None: - self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - - if self.expert_map_manager.expert_mask is not None: - self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - - # Bit of hack until things are settled - self.__dict__.update(kwargs) - - moe_quant_params = { - "num_experts": moe_config.num_local_experts, - "hidden_size": moe_config.hidden_dim, - "unpadded_hidden_size": unpadded_hidden_size, - "intermediate_size_per_partition": ( - moe_config.intermediate_size_per_partition - ), - "params_dtype": params_dtype, - "weight_loader": self.weight_loader, - "global_num_experts": moe_config.num_experts, - } - - # need full intermediate size pre-sharding for WNA16 act order - if self._needs_intermediate_size_param(quant_method): - moe_quant_params["intermediate_size_full"] = intermediate_size - - quant_method.create_weights(layer=self, **moe_quant_params) - - # TODO(bnell): make this a method on quant_method - def _needs_intermediate_size_param(self, quant_method: FusedMoEMethodBase) -> bool: - return quant_method.__class__.__name__ in ( - "GPTQMarlinMoEMethod", - "CompressedTensorsWNA16MarlinMoEMethod", - "CompressedTensorsWNA16MoEMethod", - ) - - def _ensure_moe_quant_config_init(self): - if 
self.quant_method.moe_quant_config is None: - # Note: the moe_quant_config can't be constructed until after - # weight loading post processing. - self.quant_method.moe_quant_config = ( - self.quant_method.get_fused_moe_quant_config(self) - ) - - @property - def expert_map(self) -> torch.Tensor | None: - return ( - self.expert_map_manager.expert_map - if not self.rocm_aiter_fmoe_enabled - else self.expert_map_manager.expert_mask - ) - - def _maybe_init_expert_routing_tables( - self, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - """Get routing tables (already initialized by manager).""" - # Return routing tables from manager - routing_tables = self.expert_map_manager.routing_tables - - if routing_tables is None: - return None - - # Register buffers for backward compatibility if not already registered - if not hasattr(self, "expert_global_to_physical"): - global_to_physical, physical_to_global, local_global = routing_tables - self.register_buffer("expert_global_to_physical", global_to_physical) - self.register_buffer("expert_physical_to_global", physical_to_global) - self.register_buffer("expert_local_to_global", local_global) - - return routing_tables - - def update_expert_map(self): - """Update expert mappings for new EP configuration.""" - # ep_size and ep_rank should already be updated in moe_parallel_config - self.expert_map_manager.update() - - # Re-register buffers for state_dict compatibility - self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - - # Update routing table buffers if needed - self._maybe_init_expert_routing_tables() - - # Handle AITER shared experts if needed - if self.aiter_fmoe_shared_expert_enabled: - self._init_aiter_shared_experts_topK_buffer( - vllm_config=get_current_vllm_config(), - dp_size=get_dp_group().world_size, - ) - - def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - """Map global expert ID 
to local expert ID.""" - return self.expert_map_manager.map_global_to_local(expert_id) - - # - # Weight Loading Methods - # - - def _load_per_tensor_weight_scale( - self, - shard_id: str, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - expert_id: int, - ): - param_data = param.data - # for per tensor weight quantization - if shard_id in ("w1", "w3"): - # We have to keep the weight scales of w1 and w3 because - # we need to re-quantize w1/w3 weights after weight loading. - idx = 0 if shard_id == "w1" else 1 - param_data[expert_id][idx] = loaded_weight - # If we are in the row parallel case (down_proj) - elif shard_id == "w2": - param_data[expert_id] = loaded_weight - - def _load_combined_w13_weight_scale( - self, - shard_dim: int, - loaded_weight: torch.Tensor, - param: torch.Tensor, - tp_rank: int, - ): - """ - Load w13 weight scales assuming that w1 weight scales and w3 weight - scales are stored in the same loaded_weight tensor. - """ - shard_size = param.shape[shard_dim] - loaded_weight = loaded_weight.narrow( - shard_dim, shard_size * tp_rank, shard_size - ) - param.copy_(loaded_weight) - - def _load_model_weight_or_group_weight_scale( - self, - shard_dim: int, - expert_data: torch.Tensor, - shard_id: str, - loaded_weight: torch.Tensor, - tp_rank: int, - load_full_w2: bool = False, - ): - """ - Load grouped weight scales for group quantization or model weights - :param shard_dim: dimension to shard - :param expert_data: parameter for a particular expert - :param shard_id: either w1, w2, or w3 - :param loaded_weight: checkpoint weight to load into the param - :param tp_rank: tensor parallel rank - :param load_full_w2: whether or not the w2 loaded should be sharded. 
- """ - if shard_id == "w2": - # In the case where we have actorder/g_idx, we do not partition the - # w2 scales, as indicated by `load_full` argument, for all tp cases - self._load_w2( - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=tp_rank, - load_full=load_full_w2, - ) - elif shard_id in ("w1", "w3"): - self._load_w13( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=tp_rank, - ) - - def _load_per_channel_weight_scale( - self, - expert_data: torch.Tensor, - shard_dim: int, - shard_id: str, - loaded_weight: torch.Tensor, - tp_rank: int, - ): - # for per channel weight quantization - if shard_id == "w2": - expert_data.copy_(loaded_weight) - elif shard_id in ("w1", "w3"): - self._load_w13( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=tp_rank, - ) - - def _load_w13( - self, - expert_data: torch.Tensor, - shard_dim: int, - shard_id: str, - loaded_weight: torch.Tensor, - tp_rank: int, - load_full: bool = False, - ): - # Index the loaded weight for tp sharding. - # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim - if self.moe_config.is_act_and_mul: - shard_size = expert_data.shape[shard_dim] // 2 - else: - shard_size = expert_data.shape[shard_dim] - # Only narrow if the loaded_weight is not a scalar (0-dim tensor) - # and we're not loading the full weight - if not load_full and loaded_weight.ndim > 0: - loaded_weight = loaded_weight.narrow( - shard_dim, shard_size * tp_rank, shard_size - ) - # Narrow parameter and load. - # w1, gate_proj: Load into first logical weight of w13. - if shard_id == "w1": - expert_data = expert_data.narrow(shard_dim, 0, shard_size) - # w3, up_proj: Load into second logical weight of w13. 
- else: - assert shard_id == "w3" - expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) - expert_data.copy_(loaded_weight) - - def _load_w2( - self, - expert_data: torch.Tensor, - shard_dim: int, - loaded_weight: torch.Tensor, - tp_rank: int, - load_full: bool = False, - ): - # Index the loaded weight for tp sharding. - # down_proj: "RowParallel" so tp sharding on input_dim - # Narrow parameter and load. - shard_size = expert_data.shape[shard_dim] - # Only narrow if the loaded_weight is not a scalar (0-dim tensor) - # and we're not loading the full weight - if not load_full and loaded_weight.ndim > 0: - loaded_weight = loaded_weight.narrow( - shard_dim, shard_size * tp_rank, shard_size - ) - # w2, down_proj: Load into only logical weight of w2. - expert_data.copy_(loaded_weight) - - def _load_single_value( - self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int - ): - param_data = param.data - - # Input scales can be loaded directly and should be equal. - param_data[expert_id] = loaded_weight - - def _load_g_idx( - self, - shard_id: str, - expert_data: torch.Tensor, - shard_dim: int, - loaded_weight: torch.Tensor, - tp_rank: int, - ): - if shard_id == "w2": - self._load_w2( - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=tp_rank, - ) - else: - assert shard_id in ("w1", "w3") - expert_data.copy_(loaded_weight) - - @overload - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: Literal[False], - ) -> None: ... - - @overload - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: Literal[True], - ) -> bool: ... 
- - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: bool = False, - ) -> bool | None: - if self.quant_config and self.quant_config.get_name() == "mxfp4": - # (FIXME) for gpt-oss all experts are combined - if "bias" in weight_name: - dim1 = loaded_weight.shape[1] - param.data[:, :dim1].copy_(loaded_weight) - else: - dim1 = loaded_weight.shape[1] - dim2 = loaded_weight.shape[2] - param.data[:, :dim1, :dim2].copy_(loaded_weight) - return True if return_success else None - - quant_method_name = self.quant_method.__class__.__name__ - global_expert_id = expert_id - expert_id = self.layer._map_global_expert_id_to_local_expert_id( - global_expert_id - ) - - use_global_sf = ( - getattr(self.quant_method, "use_global_sf", False) - and "input_scale" in weight_name - ) - - if expert_id == -1 and not use_global_sf: - # Failed to load this param since it's not local to this rank - return False if return_success else None - # Hereafter, `expert_id` is local physical id - - # is_transposed: if the dim to shard the weight - # should be flipped. Required by GPTQ, compressed-tensors - # should be whatever dimension intermediate_size_per_partition is - is_transposed = getattr(param, "is_transposed", False) - - # compressed-tensors checkpoints with packed weights are stored flipped - # TODO (mgoin): check self.layer._runner.quant_method.quant_config.quant_format - # against known CompressionFormat enum values that have this quality - if quant_method_name in ( - "CompressedTensorsWNA16MarlinMoEMethod", - "CompressedTensorsWNA16MoEMethod", - ): - if is_transposed: - loaded_weight = loaded_weight.t().contiguous() - else: - loaded_weight = loaded_weight - - if shard_id not in ("w1", "w2", "w3"): - raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.") - - # Fetch the dim to shard the parameter/loaded weight - # based on the shard id. 
This will be whatever - # dimension intermediate_size_per_partition is used. - SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} - - is_gguf_weight = getattr(param, "is_gguf_weight", False) - is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) - if is_gguf_weight_type: - param.weight_type = loaded_weight.item() - param.data.copy_(loaded_weight) - return True if return_success else None - - # Case for BitsAndBytes - use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - if use_bitsandbytes_4bit: - shard_dim = 0 - - expert_data = param.data[expert_id] - if shard_id == "w2": - expert_data.copy_(loaded_weight) - elif shard_id in ("w1", "w3"): - # BNB inflight quantization has already sharded the weights - full_load = True - self._load_w13( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.moe_config.tp_rank, - load_full=full_load, - ) - return True if return_success else None - - shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] - if is_transposed: - shard_dim = int(not shard_dim) - - full_load = len(loaded_weight.shape) == 3 - if full_load: - shard_dim += 1 - - # Materialize GGUF UninitializedParameter accounting merged weights - if is_gguf_weight and isinstance(param, UninitializedParameter): - # To materialize a tensor, we must have full shape including - # number of experts, making this portion to require `full_load`. - assert full_load - final_shape = list(loaded_weight.shape) - # w1 and w3 are merged per expert. 
- if shard_id in {"w1", "w3"}: - final_shape[1] *= 2 - final_shape[shard_dim] = final_shape[shard_dim] // self.moe_config.tp_size - param.materialize(final_shape, dtype=loaded_weight.dtype) - - expert_data = param.data if full_load else param.data[expert_id] - - # Case input scale: input_scale loading is only supported for fp8 - if "input_scale" in weight_name: - # this is needed for compressed-tensors only - loaded_weight = loaded_weight.to(param.data.device) - - if ( - "compressed" in quant_method_name.lower() - and param.data[expert_id] != 1 - and (param.data[expert_id] - loaded_weight).abs() > 1e-5 - ): - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param.data[expert_id]} " - f"vs. {loaded_weight}" - ) - - self._load_single_value( - param=param, - loaded_weight=loaded_weight, - expert_id=global_expert_id if use_global_sf else expert_id, - ) - return True if return_success else None - - # Case g_idx - if "g_idx" in weight_name: - self._load_g_idx( - shard_dim=0, - shard_id=shard_id, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.moe_config.tp_rank, - ) - return True if return_success else None - - # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern - if "ModelOpt" in quant_method_name: - # Determine per-tensor weight scale patterns based on variant - # Use the dedicated method instead of brittle string matching - uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() - quant_method = getattr(param, "quant_method", None) - - # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) - # weights scales. - # Input scales are always per-tensor. - # Weight scales: FP4 uses "weight_scale_2" and FP8 uses - # "weight_scale" for per-tensor scales. - # NOTE: ModelOpt MXFP8 MoE uses block scales in weight_scale - # tensors (quant_method=BLOCK), so those must not be treated - # as per-tensor scalars here. 
- is_block_weight_scale = ( - "weight_scale" in weight_name - and quant_method == FusedMoeWeightScaleSupported.BLOCK.value - ) - is_per_tensor = ( - "weight_scale_2" in weight_name - if uses_weight_scale_2 - else "weight_scale" in weight_name - ) or "input_scale" in weight_name - is_per_tensor = is_per_tensor and not is_block_weight_scale - if is_per_tensor: - self._load_per_tensor_weight_scale( - shard_id=shard_id, - param=param, - loaded_weight=loaded_weight, - expert_id=expert_id, - ) - return True if return_success else None - - # If the weight is w13_weight_scale and w13_weight_scales are - # combined into single loaded_weight, call - # _load_combined_w13_weight_scale() to load it. - # This is checked by comparing the hidden_out dims of the - # loaded_weight and the param. - if "w13_weight_scale" in weight_name: - loaded_weight_hidden_out = loaded_weight.shape[-2] - param_hidden_out = param.data.shape[-2] * self.moe_config.tp_size - if loaded_weight_hidden_out == param_hidden_out: - self._load_combined_w13_weight_scale( - shard_dim=shard_dim, - loaded_weight=loaded_weight, - param=expert_data, - tp_rank=self.moe_config.tp_rank, - ) - return True if return_success else None - - # For other weights, call _load_model_weight_or_group_weight_scale() - # to load it. 
- if "weight" in weight_name: - self._load_model_weight_or_group_weight_scale( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.moe_config.tp_rank, - ) - return True if return_success else None - - # Case weight scales, zero_points and offset, weight/input global scales - if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name: - # load the weight scales and zp based on the quantization scheme - # supported weight scales/zp can be found in - # FusedMoeWeightScaleSupported - # TODO @dsikka: once hardened, refactor to use vLLM Parameters - # specific to each case - quant_method = getattr(param, "quant_method", None) - if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value: - self._load_per_channel_weight_scale( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.moe_config.tp_rank, - ) - elif quant_method in [ - FusedMoeWeightScaleSupported.GROUP.value, - FusedMoeWeightScaleSupported.BLOCK.value, - ]: - self._load_model_weight_or_group_weight_scale( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.moe_config.tp_rank, - load_full_w2=getattr(param, "load_full_w2", False), - ) - elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: - self._load_per_tensor_weight_scale( - shard_id=shard_id, - param=param, - loaded_weight=loaded_weight, - expert_id=expert_id, - ) - else: - WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported] - raise ValueError( - f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}" - ) - return True if return_success else None - - # Case weight_shape - if "weight_shape" in weight_name: - # only required by compressed-tensors - self._load_single_value( - param=param, loaded_weight=loaded_weight, expert_id=expert_id - ) - return True if return_success else None - - # Case model weights - if "weight" 
in weight_name: - self._load_model_weight_or_group_weight_scale( - shard_id=shard_id, - shard_dim=shard_dim, - loaded_weight=loaded_weight, - expert_data=expert_data, - tp_rank=self.moe_config.tp_rank, - ) - return True if return_success else None - - return False if return_success else None - - def load_weights( - self, weights: Iterable[tuple[str, torch.Tensor]] - ) -> Iterable[str]: - if (expert_mapping := self.layer.expert_mapping) is None: - raise ValueError( - "`self.layer.expert_mapping` must be provided to " - "load weights using `self.load_weights`." - ) - for expert_name, loaded_weight in weights: - qual_name = f"{self.layer_name}.{expert_name}" - for param_name, weight_name, expert_id, shard_id in expert_mapping: - if weight_name not in qual_name: - continue - weight_name = qual_name.replace(weight_name, param_name) - param_name = weight_name.removeprefix(f"{self.layer_name}.") - param = getattr(self, param_name) - # Fused expert weights can be identified by their 3D tensors - if loaded_weight.dim() == 3: - # Repurpose expert_id as shard_idx for deconcatenating w1 and w3 - if shard_id in {"w1", "w3"}: - shard_idx = expert_id - experts_shard = loaded_weight.chunk(2, dim=1)[shard_idx] - else: - experts_shard = loaded_weight - start = 0 - else: - # loaded_weight is a single expert weight, so we add a dummy expert - # dimension to unify the loading logic with the fused case - experts_shard = loaded_weight.unsqueeze(0) - start = expert_id - - # Unified loading logic for fused and non-fused experts - loaded_experts = experts_shard.unbind() - for expert_id, loaded_expert in enumerate(loaded_experts, start=start): - success = self.weight_loader( - param=param, - loaded_weight=loaded_expert, - weight_name=weight_name, - shard_id=shard_id, - expert_id=expert_id, - return_success=True, - ) - if success: - logger.debug( - "Loaded expert %d of shard %s into %s for layer %s", - expert_id, - shard_id, - param_name, - self.layer_name, - ) - yield param_name - - # - # 
Execution - # - - def forward( - self, - x: torch.Tensor, - topk_weights: torch.Tensor | None = None, - topk_ids: torch.Tensor | None = None, - router_logits: torch.Tensor | None = None, - shared_experts_input: torch.Tensor | None = None, - ) -> torch.Tensor: - """ - Execute routed experts using the quantization method's apply function. - - This is called by the runner after router selection (for modular kernels) - or with router logits (for monolithic kernels). It delegates to - quant_method.apply() which accesses the weights on this RoutedExperts - instance. - - Args: - x: Input tensor after any transforms - topk_weights: Routing weights from router (for modular kernels) - topk_ids: Selected expert IDs from router (for modular kernels) - router_logits: Router logits (for monolithic kernels) - shared_experts_input: Input for shared experts (if any) - - Returns: - Output tensor from routed experts - """ - quant_method = self.quant_method - - if quant_method.is_monolithic: - # Monolithic kernels handle routing internally - return quant_method.apply_monolithic( - layer=self, # Pass RoutedExperts as layer - x=x, - router_logits=router_logits, - ) - else: - # Modular kernels use pre-computed routing - return quant_method.apply( - layer=self, # Pass RoutedExperts as layer - x=x, - topk_weights=topk_weights, - topk_ids=topk_ids, - shared_experts_input=shared_experts_input, - ) - - -# Mark the RoutedExperts weight_loader as supporting MoE-specific parameters -RoutedExperts.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] - - # --8<-- [start:fused_moe] @CustomOp.register("fused_moe") class FusedMoE(CustomOp): diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py new file mode 100644 index 000000000000..557300a5ef95 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -0,0 +1,742 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project + +from collections.abc import Iterable +from enum import Enum +from typing import Literal, overload + +import torch +from torch.nn.parameter import UninitializedParameter + +from vllm.config import get_current_vllm_config +from vllm.distributed import ( + get_dp_group, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, +) +from vllm.model_executor.layers.fused_moe.expert_map_manager import ( + ExpertMapManager, +) +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, +) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, +) + +logger = init_logger(__name__) + + +class FusedMoeWeightScaleSupported(Enum): + TENSOR = "tensor" + CHANNEL = "channel" + GROUP = "group" + BLOCK = "block" + + +class RoutedExperts(torch.nn.Module): + """ + Container for routed expert weights and execution logic. + + This module owns the expert weight parameters (w13_weight, w2_weight, scales, etc.) + and handles: + - Loading checkpoint weights into parameters + - Executing routed experts via quant_method.apply() + + Weight parameters are registered on this module via _ParameterRegistrationWrapper + during FusedMoE initialization. + """ + + def __init__( + self, + layer_name: str, + params_dtype: torch.dtype, + unpadded_hidden_size: int, # put in moe_config? 
+ intermediate_size: int, + moe_config: FusedMoEConfig, + quant_config: QuantizationConfig | None, + quant_method: FusedMoEMethodBase, + expert_map_manager: ExpertMapManager, + **kwargs, + ): + super().__init__() + self.layer_name = layer_name + self.moe_config = moe_config + self.quant_config = quant_config + self.quant_method = quant_method + self.expert_map_manager = expert_map_manager + self.hidden_size = moe_config.hidden_dim + self.intermediate_size_per_partition = ( + moe_config.intermediate_size_per_partition + ) + self.global_num_experts = moe_config.num_experts + self.local_num_experts = moe_config.num_local_experts + + # Register buffers for state_dict compatibility + if self.expert_map_manager.expert_map is not None: + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + + if self.expert_map_manager.expert_mask is not None: + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + + # Bit of hack until things are settled + self.__dict__.update(kwargs) + + moe_quant_params = { + "num_experts": moe_config.num_local_experts, + "hidden_size": moe_config.hidden_dim, + "unpadded_hidden_size": unpadded_hidden_size, + "intermediate_size_per_partition": ( + moe_config.intermediate_size_per_partition + ), + "params_dtype": params_dtype, + "weight_loader": self.weight_loader, + "global_num_experts": moe_config.num_experts, + } + + # need full intermediate size pre-sharding for WNA16 act order + if self._needs_intermediate_size_param(quant_method): + moe_quant_params["intermediate_size_full"] = intermediate_size + + quant_method.create_weights(layer=self, **moe_quant_params) + + # TODO(bnell): make this a method on quant_method + def _needs_intermediate_size_param(self, quant_method: FusedMoEMethodBase) -> bool: + return quant_method.__class__.__name__ in ( + "GPTQMarlinMoEMethod", + "CompressedTensorsWNA16MarlinMoEMethod", + "CompressedTensorsWNA16MoEMethod", + ) + + def _ensure_moe_quant_config_init(self): + if 
self.quant_method.moe_quant_config is None: + # Note: the moe_quant_config can't be constructed until after + # weight loading post processing. + self.quant_method.moe_quant_config = ( + self.quant_method.get_fused_moe_quant_config(self) + ) + + @property + def expert_map(self) -> torch.Tensor | None: + return ( + self.expert_map_manager.expert_map + if not self.rocm_aiter_fmoe_enabled + else self.expert_map_manager.expert_mask + ) + + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + """Get routing tables (already initialized by manager).""" + # Return routing tables from manager + routing_tables = self.expert_map_manager.routing_tables + + if routing_tables is None: + return None + + # Register buffers for backward compatibility if not already registered + if not hasattr(self, "expert_global_to_physical"): + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) + + return routing_tables + + def update_expert_map(self): + """Update expert mappings for new EP configuration.""" + # ep_size and ep_rank should already be updated in moe_parallel_config + self.expert_map_manager.update() + + # Re-register buffers for state_dict compatibility + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + + # Update routing table buffers if needed + self._maybe_init_expert_routing_tables() + + # Handle AITER shared experts if needed + if self.aiter_fmoe_shared_expert_enabled: + self._init_aiter_shared_experts_topK_buffer( + vllm_config=get_current_vllm_config(), + dp_size=get_dp_group().world_size, + ) + + def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: + """Map global expert ID 
to local expert ID.""" + return self.expert_map_manager.map_global_to_local(expert_id) + + # + # Weight Loading Methods + # + + def _load_per_tensor_weight_scale( + self, + shard_id: str, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + expert_id: int, + ): + param_data = param.data + # for per tensor weight quantization + if shard_id in ("w1", "w3"): + # We have to keep the weight scales of w1 and w3 because + # we need to re-quantize w1/w3 weights after weight loading. + idx = 0 if shard_id == "w1" else 1 + param_data[expert_id][idx] = loaded_weight + # If we are in the row parallel case (down_proj) + elif shard_id == "w2": + param_data[expert_id] = loaded_weight + + def _load_combined_w13_weight_scale( + self, + shard_dim: int, + loaded_weight: torch.Tensor, + param: torch.Tensor, + tp_rank: int, + ): + """ + Load w13 weight scales assuming that w1 weight scales and w3 weight + scales are stored in the same loaded_weight tensor. + """ + shard_size = param.shape[shard_dim] + loaded_weight = loaded_weight.narrow( + shard_dim, shard_size * tp_rank, shard_size + ) + param.copy_(loaded_weight) + + def _load_model_weight_or_group_weight_scale( + self, + shard_dim: int, + expert_data: torch.Tensor, + shard_id: str, + loaded_weight: torch.Tensor, + tp_rank: int, + load_full_w2: bool = False, + ): + """ + Load grouped weight scales for group quantization or model weights + :param shard_dim: dimension to shard + :param expert_data: parameter for a particular expert + :param shard_id: either w1, w2, or w3 + :param loaded_weight: checkpoint weight to load into the param + :param tp_rank: tensor parallel rank + :param load_full_w2: whether or not the w2 loaded should be sharded. 
+ """ + if shard_id == "w2": + # In the case where we have actorder/g_idx, we do not partition the + # w2 scales, as indicated by `load_full` argument, for all tp cases + self._load_w2( + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank, + load_full=load_full_w2, + ) + elif shard_id in ("w1", "w3"): + self._load_w13( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank, + ) + + def _load_per_channel_weight_scale( + self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.Tensor, + tp_rank: int, + ): + # for per channel weight quantization + if shard_id == "w2": + expert_data.copy_(loaded_weight) + elif shard_id in ("w1", "w3"): + self._load_w13( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank, + ) + + def _load_w13( + self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.Tensor, + tp_rank: int, + load_full: bool = False, + ): + # Index the loaded weight for tp sharding. + # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim + if self.moe_config.is_act_and_mul: + shard_size = expert_data.shape[shard_dim] // 2 + else: + shard_size = expert_data.shape[shard_dim] + # Only narrow if the loaded_weight is not a scalar (0-dim tensor) + # and we're not loading the full weight + if not load_full and loaded_weight.ndim > 0: + loaded_weight = loaded_weight.narrow( + shard_dim, shard_size * tp_rank, shard_size + ) + # Narrow parameter and load. + # w1, gate_proj: Load into first logical weight of w13. + if shard_id == "w1": + expert_data = expert_data.narrow(shard_dim, 0, shard_size) + # w3, up_proj: Load into second logical weight of w13. 
+ else: + assert shard_id == "w3" + expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) + expert_data.copy_(loaded_weight) + + def _load_w2( + self, + expert_data: torch.Tensor, + shard_dim: int, + loaded_weight: torch.Tensor, + tp_rank: int, + load_full: bool = False, + ): + # Index the loaded weight for tp sharding. + # down_proj: "RowParallel" so tp sharding on input_dim + # Narrow parameter and load. + shard_size = expert_data.shape[shard_dim] + # Only narrow if the loaded_weight is not a scalar (0-dim tensor) + # and we're not loading the full weight + if not load_full and loaded_weight.ndim > 0: + loaded_weight = loaded_weight.narrow( + shard_dim, shard_size * tp_rank, shard_size + ) + # w2, down_proj: Load into only logical weight of w2. + expert_data.copy_(loaded_weight) + + def _load_single_value( + self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int + ): + param_data = param.data + + # Input scales can be loaded directly and should be equal. + param_data[expert_id] = loaded_weight + + def _load_g_idx( + self, + shard_id: str, + expert_data: torch.Tensor, + shard_dim: int, + loaded_weight: torch.Tensor, + tp_rank: int, + ): + if shard_id == "w2": + self._load_w2( + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=tp_rank, + ) + else: + assert shard_id in ("w1", "w3") + expert_data.copy_(loaded_weight) + + @overload + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: Literal[False], + ) -> None: ... + + @overload + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: Literal[True], + ) -> bool: ... 
+ + def weight_loader( + self, + param: torch.nn.Parameter, + loaded_weight: torch.Tensor, + weight_name: str, + shard_id: str, + expert_id: int, + return_success: bool = False, + ) -> bool | None: + if self.quant_config and self.quant_config.get_name() == "mxfp4": + # (FIXME) for gpt-oss all experts are combined + if "bias" in weight_name: + dim1 = loaded_weight.shape[1] + param.data[:, :dim1].copy_(loaded_weight) + else: + dim1 = loaded_weight.shape[1] + dim2 = loaded_weight.shape[2] + param.data[:, :dim1, :dim2].copy_(loaded_weight) + return True if return_success else None + + quant_method_name = self.quant_method.__class__.__name__ + global_expert_id = expert_id + expert_id = self.layer._map_global_expert_id_to_local_expert_id( + global_expert_id + ) + + use_global_sf = ( + getattr(self.quant_method, "use_global_sf", False) + and "input_scale" in weight_name + ) + + if expert_id == -1 and not use_global_sf: + # Failed to load this param since it's not local to this rank + return False if return_success else None + # Hereafter, `expert_id` is local physical id + + # is_transposed: if the dim to shard the weight + # should be flipped. Required by GPTQ, compressed-tensors + # should be whatever dimension intermediate_size_per_partition is + is_transposed = getattr(param, "is_transposed", False) + + # compressed-tensors checkpoints with packed weights are stored flipped + # TODO (mgoin): check self.layer._runner.quant_method.quant_config.quant_format + # against known CompressionFormat enum values that have this quality + if quant_method_name in ( + "CompressedTensorsWNA16MarlinMoEMethod", + "CompressedTensorsWNA16MoEMethod", + ): + if is_transposed: + loaded_weight = loaded_weight.t().contiguous() + else: + loaded_weight = loaded_weight + + if shard_id not in ("w1", "w2", "w3"): + raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.") + + # Fetch the dim to shard the parameter/loaded weight + # based on the shard id. 
This will be whatever + # dimension intermediate_size_per_partition is used. + SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} + + is_gguf_weight = getattr(param, "is_gguf_weight", False) + is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) + if is_gguf_weight_type: + param.weight_type = loaded_weight.item() + param.data.copy_(loaded_weight) + return True if return_success else None + + # Case for BitsAndBytes + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + if use_bitsandbytes_4bit: + shard_dim = 0 + + expert_data = param.data[expert_id] + if shard_id == "w2": + expert_data.copy_(loaded_weight) + elif shard_id in ("w1", "w3"): + # BNB inflight quantization has already sharded the weights + full_load = True + self._load_w13( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + load_full=full_load, + ) + return True if return_success else None + + shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] + if is_transposed: + shard_dim = int(not shard_dim) + + full_load = len(loaded_weight.shape) == 3 + if full_load: + shard_dim += 1 + + # Materialize GGUF UninitializedParameter accounting merged weights + if is_gguf_weight and isinstance(param, UninitializedParameter): + # To materialize a tensor, we must have full shape including + # number of experts, making this portion to require `full_load`. + assert full_load + final_shape = list(loaded_weight.shape) + # w1 and w3 are merged per expert. 
+ if shard_id in {"w1", "w3"}: + final_shape[1] *= 2 + final_shape[shard_dim] = final_shape[shard_dim] // self.moe_config.tp_size + param.materialize(final_shape, dtype=loaded_weight.dtype) + + expert_data = param.data if full_load else param.data[expert_id] + + # Case input scale: input_scale loading is only supported for fp8 + if "input_scale" in weight_name: + # this is needed for compressed-tensors only + loaded_weight = loaded_weight.to(param.data.device) + + if ( + "compressed" in quant_method_name.lower() + and param.data[expert_id] != 1 + and (param.data[expert_id] - loaded_weight).abs() > 1e-5 + ): + raise ValueError( + "input_scales of w1 and w3 of a layer " + f"must be equal. But got {param.data[expert_id]} " + f"vs. {loaded_weight}" + ) + + self._load_single_value( + param=param, + loaded_weight=loaded_weight, + expert_id=global_expert_id if use_global_sf else expert_id, + ) + return True if return_success else None + + # Case g_idx + if "g_idx" in weight_name: + self._load_g_idx( + shard_dim=0, + shard_id=shard_id, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + return True if return_success else None + + # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern + if "ModelOpt" in quant_method_name: + # Determine per-tensor weight scale patterns based on variant + # Use the dedicated method instead of brittle string matching + uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern() + quant_method = getattr(param, "quant_method", None) + + # Call _load_per_tensor_weight_scale() to load per-tensor (scalar) + # weights scales. + # Input scales are always per-tensor. + # Weight scales: FP4 uses "weight_scale_2" and FP8 uses + # "weight_scale" for per-tensor scales. + # NOTE: ModelOpt MXFP8 MoE uses block scales in weight_scale + # tensors (quant_method=BLOCK), so those must not be treated + # as per-tensor scalars here. 
+ is_block_weight_scale = ( + "weight_scale" in weight_name + and quant_method == FusedMoeWeightScaleSupported.BLOCK.value + ) + is_per_tensor = ( + "weight_scale_2" in weight_name + if uses_weight_scale_2 + else "weight_scale" in weight_name + ) or "input_scale" in weight_name + is_per_tensor = is_per_tensor and not is_block_weight_scale + if is_per_tensor: + self._load_per_tensor_weight_scale( + shard_id=shard_id, + param=param, + loaded_weight=loaded_weight, + expert_id=expert_id, + ) + return True if return_success else None + + # If the weight is w13_weight_scale and w13_weight_scales are + # combined into single loaded_weight, call + # _load_combined_w13_weight_scale() to load it. + # This is checked by comparing the hidden_out dims of the + # loaded_weight and the param. + if "w13_weight_scale" in weight_name: + loaded_weight_hidden_out = loaded_weight.shape[-2] + param_hidden_out = param.data.shape[-2] * self.moe_config.tp_size + if loaded_weight_hidden_out == param_hidden_out: + self._load_combined_w13_weight_scale( + shard_dim=shard_dim, + loaded_weight=loaded_weight, + param=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + return True if return_success else None + + # For other weights, call _load_model_weight_or_group_weight_scale() + # to load it. 
+ if "weight" in weight_name: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + return True if return_success else None + + # Case weight scales, zero_points and offset, weight/input global scales + if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name: + # load the weight scales and zp based on the quantization scheme + # supported weight scales/zp can be found in + # FusedMoeWeightScaleSupported + # TODO @dsikka: once hardened, refactor to use vLLM Parameters + # specific to each case + quant_method = getattr(param, "quant_method", None) + if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value: + self._load_per_channel_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + elif quant_method in [ + FusedMoeWeightScaleSupported.GROUP.value, + FusedMoeWeightScaleSupported.BLOCK.value, + ]: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + load_full_w2=getattr(param, "load_full_w2", False), + ) + elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: + self._load_per_tensor_weight_scale( + shard_id=shard_id, + param=param, + loaded_weight=loaded_weight, + expert_id=expert_id, + ) + else: + WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported] + raise ValueError( + f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}" + ) + return True if return_success else None + + # Case weight_shape + if "weight_shape" in weight_name: + # only required by compressed-tensors + self._load_single_value( + param=param, loaded_weight=loaded_weight, expert_id=expert_id + ) + return True if return_success else None + + # Case model weights + if "weight" 
in weight_name: + self._load_model_weight_or_group_weight_scale( + shard_id=shard_id, + shard_dim=shard_dim, + loaded_weight=loaded_weight, + expert_data=expert_data, + tp_rank=self.moe_config.tp_rank, + ) + return True if return_success else None + + return False if return_success else None + + def load_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[str]: + if (expert_mapping := self.layer.expert_mapping) is None: + raise ValueError( + "`self.layer.expert_mapping` must be provided to " + "load weights using `self.load_weights`." + ) + for expert_name, loaded_weight in weights: + qual_name = f"{self.layer_name}.{expert_name}" + for param_name, weight_name, expert_id, shard_id in expert_mapping: + if weight_name not in qual_name: + continue + weight_name = qual_name.replace(weight_name, param_name) + param_name = weight_name.removeprefix(f"{self.layer_name}.") + param = getattr(self, param_name) + # Fused expert weights can be identified by their 3D tensors + if loaded_weight.dim() == 3: + # Repurpose expert_id as shard_idx for deconcatenating w1 and w3 + if shard_id in {"w1", "w3"}: + shard_idx = expert_id + experts_shard = loaded_weight.chunk(2, dim=1)[shard_idx] + else: + experts_shard = loaded_weight + start = 0 + else: + # loaded_weight is a single expert weight, so we add a dummy expert + # dimension to unify the loading logic with the fused case + experts_shard = loaded_weight.unsqueeze(0) + start = expert_id + + # Unified loading logic for fused and non-fused experts + loaded_experts = experts_shard.unbind() + for expert_id, loaded_expert in enumerate(loaded_experts, start=start): + success = self.weight_loader( + param=param, + loaded_weight=loaded_expert, + weight_name=weight_name, + shard_id=shard_id, + expert_id=expert_id, + return_success=True, + ) + if success: + logger.debug( + "Loaded expert %d of shard %s into %s for layer %s", + expert_id, + shard_id, + param_name, + self.layer_name, + ) + yield param_name + + # + # 
Execution + # + + def forward( + self, + x: torch.Tensor, + topk_weights: torch.Tensor | None = None, + topk_ids: torch.Tensor | None = None, + router_logits: torch.Tensor | None = None, + shared_experts_input: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Execute routed experts using the quantization method's apply function. + + This is called by the runner after router selection (for modular kernels) + or with router logits (for monolithic kernels). It delegates to + quant_method.apply() which accesses the weights on this RoutedExperts + instance. + + Args: + x: Input tensor after any transforms + topk_weights: Routing weights from router (for modular kernels) + topk_ids: Selected expert IDs from router (for modular kernels) + router_logits: Router logits (for monolithic kernels) + shared_experts_input: Input for shared experts (if any) + + Returns: + Output tensor from routed experts + """ + quant_method = self.quant_method + + if quant_method.is_monolithic: + # Monolithic kernels handle routing internally + return quant_method.apply_monolithic( + layer=self, # Pass RoutedExperts as layer + x=x, + router_logits=router_logits, + ) + else: + # Modular kernels use pre-computed routing + return quant_method.apply( + layer=self, # Pass RoutedExperts as layer + x=x, + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts_input=shared_experts_input, + ) + + +# Mark the RoutedExperts weight_loader as supporting MoE-specific parameters +RoutedExperts.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3244aa8c10ad..3cf3116f0670 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -8,7 +8,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import FusedMoE, RoutedExperts +from 
vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index c7a1731fdd4f..3bc4a3accbad 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -14,18 +14,18 @@ MPLinearLayerConfig, choose_mp_linear_kernel, ) -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, - FusedMoEQuantConfig, -) -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe -from vllm.model_executor.layers.fused_moe.layer import ( +from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, UnquantizedFusedMoEMethod, ) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEQuantConfig, +) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b4529d9485d1..4fcc468c6cfb 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -24,7 +24,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, RoutedExperts +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 
bf194c29c1d0..154347a930a9 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -12,7 +12,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import FusedMoE, RoutedExperts +from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 528f58fd4f75..79326c2410c4 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -14,18 +14,18 @@ MPLinearLayerConfig, choose_mp_linear_kernel, ) -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, - FusedMoEQuantConfig, -) -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe -from vllm.model_executor.layers.fused_moe.layer import ( +from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, UnquantizedFusedMoEMethod, ) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEQuantConfig, +) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 23543af51db8..b4a53c8c3110 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -11,6 +11,11 @@ from vllm.logger 
import init_logger from vllm.model_executor.kernels.linear import init_fp8_linear_kernel from vllm.model_executor.layers.attention import Attention, MLAAttention +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + FusedMoeWeightScaleSupported, + RoutedExperts, +) from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -20,11 +25,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, - FusedMoeWeightScaleSupported, - RoutedExperts, -) from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( Fp8MoeBackend, convert_to_fp8_moe_kernel_format, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index acd597c72fae..d1073d23b3bb 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -6,19 +6,19 @@ import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group -from vllm.model_executor.layers.fused_moe.activation import MoEActivation -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEQuantConfig, - int4_w4a16_moe_quant_config, - int8_w8a16_moe_quant_config, -) -from vllm.model_executor.layers.fused_moe.layer import ( +from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, ) +from vllm.model_executor.layers.fused_moe.activation import MoEActivation +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + int4_w4a16_moe_quant_config, + int8_w8a16_moe_quant_config, +) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) diff --git a/vllm/model_executor/layers/quantization/mxfp8.py 
b/vllm/model_executor/layers/quantization/mxfp8.py index 9156e5539228..5b4564bea31c 100644 --- a/vllm/model_executor/layers/quantization/mxfp8.py +++ b/vllm/model_executor/layers/quantization/mxfp8.py @@ -13,7 +13,6 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEMethodBase, - RoutedExperts, ) from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe.oracle.mxfp8 import ( diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index a06ec65e8115..78c64bac6187 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -8,7 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, RoutedExperts +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, From b32656458247edd12b48303ef85a6b5a829acade Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 27 Mar 2026 19:03:32 +0000 Subject: [PATCH 090/191] CHECKPOINT WIP moving weight loading to RoutedExperts Signed-off-by: Bill Nell --- .../layers/fused_moe/fused_moe_method_base.py | 16 ++- .../fused_moe/fused_moe_modular_method.py | 12 +- vllm/model_executor/layers/fused_moe/layer.py | 120 ++++++++---------- .../layers/fused_moe/routed_experts.py | 34 ++++- .../fused_moe/runner/chunking_moe_runner.py | 4 +- .../fused_moe/runner/default_moe_runner.py | 16 +-- .../layers/fused_moe/runner/moe_runner.py | 7 +- .../fused_moe/runner/moe_runner_base.py | 37 +++--- .../fused_moe/runner/moe_runner_factory.py | 16 +-- .../fused_moe/unquantized_fused_moe_method.py | 24 ++-- .../model_executor/layers/quantization/awq.py | 4 +- .../layers/quantization/awq_marlin.py | 11 +- .../layers/quantization/bitsandbytes.py | 7 +- 
.../compressed_tensors/compressed_tensors.py | 12 +- .../compressed_tensors_moe.py | 54 ++++---- .../layers/quantization/experts_int8.py | 5 +- .../model_executor/layers/quantization/fp8.py | 39 +++--- .../layers/quantization/gguf.py | 7 +- .../layers/quantization/gptq.py | 4 +- .../layers/quantization/gptq_marlin.py | 42 +++--- .../model_executor/layers/quantization/inc.py | 11 +- .../layers/quantization/modelopt.py | 29 ++--- .../layers/quantization/moe_wna16.py | 9 +- .../layers/quantization/mxfp4.py | 11 +- .../layers/quantization/mxfp8.py | 20 +-- .../layers/quantization/quark/quark.py | 4 +- .../layers/quantization/quark/quark_moe.py | 24 ++-- .../layers/quantization/utils/marlin_utils.py | 5 +- 28 files changed, 297 insertions(+), 287 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index b5fd754a387d..82ad9e812651 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod +from typing import TYPE_CHECKING import torch @@ -19,6 +20,11 @@ QuantizeMethodBase, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.routed_experts import ( + RoutedExperts, + ) + logger = init_logger(__name__) @@ -44,7 +50,7 @@ def mk_owns_shared_expert(self) -> bool: @abstractmethod def create_weights( self, - layer: torch.nn.Module, + layer: "RoutedExperts", num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -78,7 +84,7 @@ def maybe_make_prepare_finalize( def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: "RoutedExperts", ) -> FusedMoEExpertsModular: # based on the all2all implementation, select the appropriate # gemm implementation @@ -89,7 +95,7 @@ def select_gemm_impl( 
@abstractmethod def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: "RoutedExperts" ) -> FusedMoEQuantConfig | None: raise NotImplementedError @@ -123,7 +129,7 @@ def is_monolithic(self) -> bool: def apply( self, - layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -146,7 +152,7 @@ def apply( def apply_monolithic( self, - layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 142e180786c6..ac583671c371 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import TYPE_CHECKING import torch @@ -20,6 +21,11 @@ SharedExperts, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.routed_experts import ( + RoutedExperts, + ) + logger = init_logger(__name__) @@ -70,7 +76,7 @@ def method_name(self) -> str: def create_weights( self, - layer: torch.nn.Module, + layer: "RoutedExperts", num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -80,13 +86,13 @@ def create_weights( raise NotImplementedError def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: "RoutedExperts" ) -> FusedMoEQuantConfig | None: return self.moe_quant_config def apply( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py 
index c567a1237b98..95a647faa3a8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -49,9 +49,6 @@ from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( SharedExperts, ) -from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( - UnquantizedFusedMoEMethod, -) from vllm.model_executor.layers.fused_moe.utils import ( disable_inplace, ) @@ -137,6 +134,19 @@ def maybe_roundup_hidden_size( return hidden_size +def register_layer_for_moe_forward_op( + vllm_config: VllmConfig, + layer: torch.nn.Module, +): + # For smuggling this layer into the fused moe custom op + prefix = layer.layer_name + compilation_config = vllm_config.compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError("Duplicate layer name: {}".format(prefix)) + compilation_config.static_forward_context[prefix] = layer + compilation_config.static_all_moe_layers.append(prefix) + + # --8<-- [start:fused_moe] @CustomOp.register("fused_moe") class FusedMoE(CustomOp): @@ -203,6 +213,9 @@ def __init__( ): super().__init__() + # IMPORTANT: RoutedExperts must have same layer_name/prefix as FusedMoE for now + self.layer_name = prefix + if params_dtype is None: params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype @@ -251,13 +264,6 @@ def __init__( # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping - # For smuggling this layer into the fused moe custom op - compilation_config = vllm_config.compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError("Duplicate layer name: {}".format(prefix)) - compilation_config.static_forward_context[prefix] = self - compilation_config.static_all_moe_layers.append(prefix) - self.layer_name = prefix expert_placement_strategy: ExpertPlacementStrategy = ( vllm_config.parallel_config.expert_placement_strategy ) @@ -430,7 +436,7 @@ def __init__( ) self.hidden_size = 
hidden_size - self.moe_config: FusedMoEConfig = FusedMoEConfig( + self.moe_config = FusedMoEConfig( num_experts=self.global_num_experts, experts_per_token=top_k, hidden_dim=hidden_size, @@ -466,23 +472,40 @@ def __init__( logger.debug("FusedMoEConfig = %s", self.moe_config) - quant_method = self._get_quant_method( - prefix, - quant_config, - self.moe_config, - ) - - # TODO(bnell): only for weight loading. how to get around this? - self.quant_method = quant_method - # Move XXXXXXXXXXXXX if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): raise NotImplementedError( "is_act_and_mul=False is supported only for CUDA and ROCm for now" ) + # Create RoutedExperts instance BEFORE create_weights() + # This will hold all expert weight parameters + self.routed_experts = RoutedExperts( + self.layer_name, + params_dtype, + unpadded_hidden_size, + intermediate_size, + self.moe_config, + self.quant_config, + expert_map_manager=self.expert_map_manager, + # Extra params that are needed by quant_methods, pass along for now + rocm_aiter_fmoe_enabled=self.rocm_aiter_fmoe_enabled, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + apply_router_weight_on_input=apply_router_weight_on_input, + activation=MoEActivation.from_str(activation), + ) + + # HACK + self.quant_method = self.routed_experts.quant_method + # Move XXXXXXXXXXXXX - if eplb_manager is not None and not quant_method.supports_eplb: + if eplb_manager is not None and not self.quant_method.supports_eplb: # TODO: Add support for additional quantization methods. 
# The implementation for other quantization methods does not # contain essential differences, but the current quant API @@ -491,51 +514,31 @@ def __init__( # If you plan to add support for more quantization methods, # please refer to the implementation in `Fp8MoEMethod`. raise NotImplementedError( - f"EPLB is not supported {quant_method.__class__.__name__}." + f"EPLB is not supported {self.quant_method.__class__.__name__}." ) # Storing the runner in the FusedMoE is an intermediate state, eventually # the runner will own the FusedMoE layer and provide the execution interface # for MoE ops. self._runner = create_moe_runner( - layer=self, + layer_name=self.layer_name, moe_config=self.moe_config, router=router, routed_input_transform=routed_input_transform, routed_output_transform=routed_output_transform, gate=gate, shared_experts=shared_experts, - quant_method=quant_method, + routed_experts=self.routed_experts, enable_dbo=vllm_config.parallel_config.enable_dbo, apply_scale_to_output=apply_scale_to_output, routed_scaling_factor=routed_scaling_factor, ) - # Create RoutedExperts instance BEFORE create_weights() - # This will hold all expert weight parameters - self.routed_experts = RoutedExperts( - self.layer_name, - params_dtype, - unpadded_hidden_size, - intermediate_size, - self.moe_config, - self.quant_config, - self.quant_method, - expert_map_manager=self.expert_map_manager, - # Extra params that are needed by quant_methods, pass along for now - rocm_aiter_fmoe_enabled=self.rocm_aiter_fmoe_enabled, - use_grouped_topk=use_grouped_topk, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - apply_router_weight_on_input=apply_router_weight_on_input, - activation=MoEActivation.from_str(activation), - # XXXXXXXXXXXXXXXXXXXXXXXX - shared_experts=self._runner.shared_experts, - ) + # HACK 
XXXXXXXXXXXXXXXXXXXXXXXX + self.routed_experts.shared_experts = self._runner.shared_experts + + # For smuggling this layer into the fused moe custom op + register_layer_for_moe_forward_op(vllm_config, self) def extra_repr(self) -> str: s = ( @@ -549,31 +552,12 @@ def extra_repr(self) -> str: return s - def _get_quant_method( - self, - prefix: str, - quant_config: QuantizationConfig | None, - moe_config: FusedMoEConfig, - ) -> FusedMoEMethodBase: - """ - Helper method to ensure quant_method is never None and - of the proper type. - """ - quant_method = None - if quant_config is not None: - quant_method = quant_config.get_quant_method(self, prefix) - if quant_method is None: - quant_method = UnquantizedFusedMoEMethod(moe_config) - assert isinstance(quant_method, FusedMoEMethodBase) - return quant_method - # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py # and vllm/distributed/elastic_ep/elastic_execute.py # can safely swap out the quant_method. We should figure out a less # intrusive way to do this. 
def _replace_quant_method(self, mk: FusedMoEMethodBase): self._runner._replace_quant_method(mk) - self.routed_experts.quant_method = mk # def _ensure_moe_quant_config_init(self): # if self._runner.quant_method.moe_quant_config is None: diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 557300a5ef95..d8b427276576 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -22,6 +22,9 @@ from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) +from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( + UnquantizedFusedMoEMethod, +) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, ) @@ -57,7 +60,6 @@ def __init__( intermediate_size: int, moe_config: FusedMoEConfig, quant_config: QuantizationConfig | None, - quant_method: FusedMoEMethodBase, expert_map_manager: ExpertMapManager, **kwargs, ): @@ -65,7 +67,6 @@ def __init__( self.layer_name = layer_name self.moe_config = moe_config self.quant_config = quant_config - self.quant_method = quant_method self.expert_map_manager = expert_map_manager self.hidden_size = moe_config.hidden_dim self.intermediate_size_per_partition = ( @@ -84,6 +85,12 @@ def __init__( # Bit of hack until things are settled self.__dict__.update(kwargs) + self.quant_method = self._get_quant_method( + self.layer_name, + self.quant_config, + self.moe_config, + ) + moe_quant_params = { "num_experts": moe_config.num_local_experts, "hidden_size": moe_config.hidden_dim, @@ -97,10 +104,28 @@ def __init__( } # need full intermediate size pre-sharding for WNA16 act order - if self._needs_intermediate_size_param(quant_method): + if self._needs_intermediate_size_param(self.quant_method): moe_quant_params["intermediate_size_full"] = intermediate_size - quant_method.create_weights(layer=self, **moe_quant_params) + 
self.quant_method.create_weights(layer=self, **moe_quant_params) + + def _get_quant_method( + self, + prefix: str, + quant_config: QuantizationConfig | None, + moe_config: FusedMoEConfig, + ) -> FusedMoEMethodBase: + """ + Helper method to ensure quant_method is never None and + of the proper type. + """ + quant_method = None + if quant_config is not None: + quant_method = quant_config.get_quant_method(self, prefix) + if quant_method is None: + quant_method = UnquantizedFusedMoEMethod(moe_config) + assert isinstance(quant_method, FusedMoEMethodBase) + return quant_method # TODO(bnell): make this a method on quant_method def _needs_intermediate_size_param(self, quant_method: FusedMoEMethodBase) -> bool: @@ -692,6 +717,7 @@ def load_weights( # Execution # + # TODO: split this def forward( self, x: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py index bdae155aab54..c0adeddde284 100644 --- a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py @@ -63,7 +63,7 @@ def __init__(self, inner: MoERunnerBase, **kwargs): # return getattr(self._inner, name) def _replace_quant_method(self, quant_method: FusedMoEMethodBase): - self._quant_method = quant_method + self.routed_experts.quant_method = quant_method self._inner._replace_quant_method(quant_method) assert self._shared_experts == self._inner._shared_experts @@ -136,7 +136,6 @@ def _slice_and_copy_input( def _forward_impl( self, - layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, @@ -194,7 +193,6 @@ def _forward_impl( # Delegate per-chunk computation to the inner runner. 
chunk_result = self._inner._forward_impl( - layer=layer, hidden_states=hidden_states_chunk, router_logits=router_logits_chunk, shared_experts_input=shared_experts_input_chunk, diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 805eeb266068..3d22c7dbb756 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -11,8 +11,8 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, ) -from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( - FusedMoEMethodBase, +from vllm.model_executor.layers.fused_moe.routed_experts import ( + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, @@ -51,26 +51,26 @@ class DefaultMoERunner(MoERunnerBase): def __init__( self, - layer: torch.nn.Module, + layer_name: str, moe_config: FusedMoEConfig, router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, shared_experts: torch.nn.Module | None, - quant_method: FusedMoEMethodBase, + routed_experts: RoutedExperts, enable_dbo: bool, routed_output_transform: torch.nn.Module | None = None, apply_scale_to_output: bool = False, routed_scaling_factor: float = 1.0, ): super().__init__( - layer, + layer_name, moe_config, router, routed_input_transform, gate, shared_experts, - quant_method, + routed_experts, enable_dbo, routed_output_transform=routed_output_transform, apply_scale_to_output=apply_scale_to_output, @@ -85,7 +85,6 @@ def do_naive_dispatch_combine(self) -> bool: def _maybe_dispatch( self, - layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -139,7 +138,6 @@ def _maybe_combine( def _forward_impl( self, - layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, 
shared_experts_input: torch.Tensor | None, @@ -148,13 +146,11 @@ def _forward_impl( # #32567 lands and the remaining kernels are made MKs. The PCP # code will probably remain hidden_states, router_logits = self._maybe_dispatch( - layer, hidden_states, router_logits, ) shared_output, hidden_states = self._apply_quant_method( - layer=layer, hidden_states=hidden_states, router_logits=router_logits, shared_experts_input=shared_experts_input, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 16b5f540399a..24105709e8a1 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -44,11 +44,12 @@ def quant_method(self) -> FusedMoEMethodBase: def shared_experts(self) -> SharedExperts | None: raise NotImplementedError + @property @abstractmethod - def _replace_quant_method(self, quant_method: FusedMoEMethodBase): + def is_internal_router(self) -> bool: raise NotImplementedError - @property + # Temporary hack @abstractmethod - def is_internal_router(self) -> bool: + def _replace_quant_method(self, quant_method: FusedMoEMethodBase): raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index dd2adcb6d5b5..63d4ac418fda 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, ) @@ -44,7 +45,7 @@ logger = init_logger(__name__) -def get_layer_from_name(layer_name: str) -> torch.nn.Module: +def get_layer_from_name(layer_name: str) -> 
torch.nn.Module: # FusedMoE forward_context: ForwardContext = get_forward_context() if layer_name == "from_forward_context": all_moe_layers = forward_context.all_moe_layers @@ -58,7 +59,9 @@ def get_layer_from_name(layer_name: str) -> torch.nn.Module: ) layer_name = all_moe_layers[moe_layer_index] forward_context.moe_layer_index += 1 - return forward_context.no_compile_layers[layer_name] + layer = forward_context.no_compile_layers[layer_name] + # assert isinstance(layer, FusedMoE) + return layer # On torch >= 2.11, layer_name is a hoisted ModuleName opaque object; @@ -88,7 +91,6 @@ def _moe_forward( ) -> torch.Tensor: layer = get_layer_from_name(_resolve_layer_name(layer_name)) return layer._runner._forward_dispatch( - layer.routed_experts, hidden_states, router_logits, shared_experts_input, @@ -112,7 +114,6 @@ def _moe_forward_shared( ) -> tuple[torch.Tensor, torch.Tensor]: layer = get_layer_from_name(_resolve_layer_name(layer_name)) return layer._runner._forward_dispatch( - layer.routed_experts, hidden_states, router_logits, shared_experts_input, @@ -186,13 +187,13 @@ class MoERunnerBase(MoERunner): def __init__( self, - layer: torch.nn.Module, + layer_name: str, moe_config: FusedMoEConfig, router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, shared_experts: torch.nn.Module | None, - quant_method: FusedMoEMethodBase, + routed_experts: RoutedExperts, enable_dbo: bool, routed_output_transform: torch.nn.Module | None = None, apply_scale_to_output: bool = False, @@ -209,9 +210,9 @@ def __init__( self._shared_experts = SharedExperts( shared_experts, moe_config=moe_config, - mk_owns_shared_expert=quant_method.mk_owns_shared_expert, # ? 
+ mk_owns_shared_expert=routed_experts.quant_method.mk_owns_shared_expert, ) - self._quant_method = quant_method + self.routed_experts = routed_experts self.enable_dbo = enable_dbo self.enable_eplb = moe_config.moe_parallel_config.enable_eplb self.apply_scale_to_output = ( @@ -220,9 +221,9 @@ def __init__( self.routed_scaling_factor = routed_scaling_factor # Needed for string -> FusedMoE layer lookup in custom ops. - self.layer_name = layer.layer_name + self.layer_name = layer_name - self._forward_entry = self._select_forward(layer) + self._forward_entry = self._select_forward() @property def is_internal_router(self) -> bool: @@ -230,7 +231,7 @@ def is_internal_router(self) -> bool: @property def quant_method(self) -> FusedMoEMethodBase: - return self._quant_method + return self.routed_experts.quant_method @property def shared_experts(self) -> SharedExperts | None: @@ -238,13 +239,13 @@ def shared_experts(self) -> SharedExperts | None: # TODO(bnell): Temporary hack. Get rid of this. def _replace_quant_method(self, quant_method: FusedMoEMethodBase): - self._quant_method = quant_method + self.routed_experts.quant_method = quant_method if self._shared_experts is not None: self._shared_experts._mk_owns_shared_expert = ( quant_method.mk_owns_shared_expert ) - def _select_forward(self, layer: torch.nn.Module) -> Callable: + def _select_forward(self) -> Callable: if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we # will switch to using the moe_forward custom op. 
@@ -446,7 +447,6 @@ def _maybe_apply_shared_experts( def _apply_quant_method( self, - layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, @@ -465,7 +465,7 @@ def _apply_quant_method( if self.quant_method.is_monolithic: # Monolithic kernels: pass router_logits to routed_experts - fused_out = layer.forward( + fused_out = self.routed_experts.forward( x=hidden_states, router_logits=router_logits, ) @@ -476,7 +476,7 @@ def _apply_quant_method( router_logits=router_logits, ) - fused_out = layer.forward( + fused_out = self.routed_experts.forward( x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, @@ -643,7 +643,6 @@ def forward( def _forward_dispatch( self, - layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, @@ -655,11 +654,10 @@ def _forward_dispatch( the sequence-parallel context. """ # TODO(bnell): this can be removed after MK migration is complete. 
- layer._ensure_moe_quant_config_init() + self.routed_experts._ensure_moe_quant_config_init() with self._sequence_parallel_context(): return self._forward_impl( - layer, hidden_states, router_logits, shared_experts_input, @@ -668,7 +666,6 @@ def _forward_dispatch( @abstractmethod def _forward_impl( self, - layer: torch.nn.Module, hidden_states: torch.Tensor, router_logits: torch.Tensor, shared_experts_input: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py index c31aec16653e..f5a43991e17e 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py @@ -6,8 +6,8 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, ) -from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( - FusedMoEMethodBase, +from vllm.model_executor.layers.fused_moe.routed_experts import ( + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, @@ -22,26 +22,26 @@ def create_moe_runner( - layer: torch.nn.Module, + layer_name: str, moe_config: FusedMoEConfig, router: FusedMoERouter, routed_input_transform: torch.nn.Module | None, gate: torch.nn.Module | None, shared_experts: torch.nn.Module | None, - quant_method: FusedMoEMethodBase, + routed_experts: RoutedExperts, enable_dbo: bool, routed_output_transform: torch.nn.Module | None = None, apply_scale_to_output: bool = False, routed_scaling_factor: float = 1.0, ) -> MoERunner: runner = DefaultMoERunner( - layer, + layer_name, moe_config, router, routed_input_transform, gate, shared_experts, - quant_method, + routed_experts, enable_dbo, routed_output_transform=routed_output_transform, apply_scale_to_output=apply_scale_to_output, @@ -50,13 +50,13 @@ def create_moe_runner( if moe_config.moe_parallel_config.use_dp_chunking: return ChunkingMoERunner( 
inner=runner, - layer=layer, + layer_name=layer_name, moe_config=moe_config, router=router, routed_input_transform=routed_input_transform, gate=gate, shared_experts=shared_experts, - quant_method=quant_method, + routed_experts=routed_experts, enable_dbo=enable_dbo, routed_output_transform=routed_output_transform, apply_scale_to_output=apply_scale_to_output, diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 1b3e40cc6efb..a53420ebb8f4 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from typing import TYPE_CHECKING import torch import torch.nn.functional as F @@ -27,6 +28,11 @@ FusedMoEExpertsModular, FusedMoEPrepareAndFinalizeModular, ) + +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.routed_experts import ( + RoutedExperts, + ) from vllm.model_executor.layers.fused_moe.oracle.unquantized import ( UnquantizedMoeBackend, convert_to_unquantized_kernel_format, @@ -88,7 +94,7 @@ def _select_monolithic(self) -> Callable: def forward_native( self, - layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -116,7 +122,7 @@ def maybe_make_prepare_finalize( def select_gemm_impl( self, prepare_finalize: FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: "RoutedExperts", ) -> FusedMoEExpertsModular: assert self.moe_quant_config is not None if ( @@ -139,7 +145,7 @@ def select_gemm_impl( def create_weights( self, - layer: torch.nn.Module, + layer: "RoutedExperts", num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -230,7 +236,7 @@ def _setup_kernel( moe_config=self.moe, ) - def 
process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: "RoutedExperts") -> None: super().process_weights_after_loading(layer) # Padding the weight for better performance on ROCm @@ -292,7 +298,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def apply( self, - layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -306,7 +312,7 @@ def apply( shared_experts_input=shared_experts_input, ) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: "RoutedExperts") -> FusedMoEQuantConfig: if self.moe.has_bias: return biased_moe_quant_config( layer.w13_bias, @@ -317,7 +323,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def forward_cuda( self, - layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -340,7 +346,7 @@ def forward_cuda( def forward_monolithic_cuda( self, - layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -366,7 +372,7 @@ def forward_monolithic_cuda( def forward_monolithic_cpu( self, - layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3cf3116f0670..be2494f64075 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -8,7 +8,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import 
FusedMoE +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -104,7 +104,7 @@ def get_quant_method( ): return UnquantizedLinearMethod() return AWQLinearMethod(self) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): # Lazy import to avoid circular import. from .awq_marlin import AWQMarlinConfig from .moe_wna16 import MoeWNA16Config diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 3bc4a3accbad..1e14f715f609 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -15,7 +15,6 @@ choose_mp_linear_kernel, ) from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, @@ -280,7 +279,7 @@ def get_quant_method( quant_method = AWQMarlinLinearMethod(self) quant_method.input_dtype = get_marlin_input_dtype(prefix) return quant_method - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config if is_layer_skipped( @@ -496,7 +495,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -595,7 +594,7 @@ def create_weights( device = layer.w13_qweight.device layer.workspace = marlin_make_workspace_new(device, 4) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: num_experts = layer.w13_qweight.shape[0] device = layer.w13_qweight.device is_a_8bit = self.input_dtype is not None and self.input_dtype.itemsize == 1 @@ -712,7 +711,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_bias.data = 
marlin_permute_bias(layer.w2_bias) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: from vllm.model_executor.layers.fused_moe.config import ( awq_marlin_moe_quant_config, @@ -736,7 +735,7 @@ def get_fused_moe_quant_config( def select_gemm_impl( self, prepare_finalize, - layer: torch.nn.Module, + layer: RoutedExperts, ): """ Select the GEMM implementation for AWQ-Marlin MoE. diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 9deaf7cc08e3..6aa4af85ae4f 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -11,7 +11,6 @@ FusedMoEQuantConfig, ) from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, RoutedExperts, ) @@ -165,7 +164,7 @@ def get_quant_method( if is_layer_skipped_bnb(prefix, self.llm_int8_skip_modules): return UnquantizedLinearMethod() return BitsAndBytesLinearMethod(self) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): return BitsAndBytesMoEMethod(self, layer.moe_config) return None @@ -452,7 +451,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -473,7 +472,7 @@ def create_weights( ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return None diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 4fcc468c6cfb..c9c9afcf98a6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -24,7 +24,7 @@ ) from vllm.logger 
import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -181,25 +181,25 @@ def get_quant_method( if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): return CompressedTensorsMoEMethod.get_moe_method( self, layer, layer_name=prefix ) return None - def _add_fused_moe_to_target_scheme_map(self): + def _add_fused_moe_to_target_scheme_map(self): # XXXXXXXXXXXXXXXXXXXXXX """ Helper function to update target_scheme_map since linear layers get fused into FusedMoE targeting 'Linear' needs to also match - FusedMoE modules. + RoutedExperts modules. """ if ( "Linear" not in self.target_scheme_map - or "FusedMoE" in self.target_scheme_map + or "RoutedExperts" in self.target_scheme_map ): return - self.target_scheme_map["FusedMoE"] = self.target_scheme_map["Linear"] + self.target_scheme_map["RoutedExperts"] = self.target_scheme_map["Linear"] @classmethod def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig": diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 3bc4c3b85a52..e0747c89ce41 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -120,7 +120,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): @staticmethod def get_moe_method( quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 - layer: torch.nn.Module, + layer: RoutedExperts, layer_name: str, ) -> FusedMoEMethodBase: # RoutedExperts was made by combining multiple Linears so need to @@ 
-175,7 +175,7 @@ def get_moe_method( # Prefer to use the MarlinMoE kernel when it is supported. if ( - not check_moe_marlin_supports_layer(layer, group_size) + not check_moe_marlin_supports_layer(layer.routed_experts, group_size) or current_platform.is_rocm() ): if ( @@ -243,7 +243,7 @@ def __init__(self, moe): def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -310,7 +310,7 @@ def create_weights( set_weight_attrs(w2_weight_scale, extra_weight_attrs) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return make_mxfp4_moe_quant_config( mxfp4_backend=self.mxfp4_backend, @@ -394,7 +394,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -587,7 +587,7 @@ def maybe_make_prepare_finalize( "logic. This function should not be called." ) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: return make_nvfp4_moe_quant_config( backend=self.nvfp4_backend, w13_scale=layer.w13_weight_scale, @@ -708,7 +708,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -947,7 +947,7 @@ def maybe_make_prepare_finalize( "logic. This function should not be called." 
) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: is_per_token = self.input_quant.strategy == QuantizationStrategy.TOKEN return make_fp8_moe_quant_config( fp8_backend=self.fp8_backend, @@ -1044,7 +1044,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -1108,11 +1108,11 @@ def create_weights( layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: pass def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return int8_w8a8_moe_quant_config( w1_scale=layer.w13_weight_scale, @@ -1124,7 +1124,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: torch.nn.Module, # RoutedExperts + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -1170,7 +1170,7 @@ def __init__( self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits] - self.marlin_input_dtype = get_marlin_input_dtype(layer_name) + self.marlin_input_dtype = get_marlin_input_dtype(layer_name) # ? 
self.use_flashinfer_mxint4_moe = ( is_flashinfer_mxint4_moe_available() and self.group_size == 32 @@ -1255,7 +1255,7 @@ def get_weight_shape( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -1416,7 +1416,7 @@ def create_weights( layer.a2_scale = None layer.marlin_state = GPTQMarlinState.REPACK - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: num_experts = layer.w13_weight_g_idx.shape[0] device = layer.w13_weight_g_idx.device if self.kernel_backend == "Flashinfer": @@ -1554,7 +1554,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.workspace = marlin_make_workspace_new(device, 4) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: if self.num_bits != 4: return None @@ -1569,7 +1569,7 @@ def get_fused_moe_quant_config( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: assert self.num_bits == 4, "only supporting w4" layer.w13_weight = layer.w13_weight_packed @@ -1696,7 +1696,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -1822,7 +1822,7 @@ def create_weights( layer.a13_scale = None layer.a2_scale = None - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: # Reconfigure packed weights and scales to match moe_wna16 format layer.w13_weight_packed = torch.nn.Parameter( layer.w13_weight_packed.transpose(1, 2).contiguous().view(torch.uint8), @@ -1840,7 +1840,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) def 
get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: assert self.num_bits == 4 or self.num_bits == 8 config_builder = ( @@ -1860,7 +1860,7 @@ def get_fused_moe_quant_config( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: if self.moe.is_lora_enabled: assert self.moe_quant_config is not None @@ -1971,7 +1971,7 @@ def __init__( # ---- parameter creation ---- def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -2065,7 +2065,7 @@ def _n_scale_cols(in_features: int) -> int: layer.group_size = g # post-load packing to dyn-4bit KleidiAI kernel's format - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: E = layer.w13_weight.shape[0] H = layer.w13_in_features I2 = layer.w13_out_features @@ -2174,7 +2174,7 @@ def _pack_matrix( ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: # CPU dynamic 4-bit MoE path does not use modular kernels or # fused_experts; quant config is not needed. 
@@ -2267,7 +2267,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -2447,7 +2447,7 @@ def maybe_make_prepare_finalize( return super().maybe_make_prepare_finalize(routing_tables) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: # Store quantization scales; both per-group and per-channel # Note we haven't specified the group size here because @@ -2465,7 +2465,7 @@ def get_fused_moe_quant_config( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: assert self.moe_quant_config is not None assert ( diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 1c11249b0e4b..5a83e6360d2b 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -7,7 +7,6 @@ from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEConfig, FusedMoEMethodBase, RoutedExperts, @@ -56,7 +55,7 @@ def get_quant_method( ) -> "QuantizeMethodBase | None": if isinstance(layer, LinearBase): return UnquantizedLinearMethod() - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): return ExpertsInt8MoEMethod(self, layer.moe_config) return None @@ -129,7 +128,7 @@ def create_weights( layer.register_parameter("w2_scale", w2_scale) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return int8_w8a16_moe_quant_config( w1_scale=layer.w13_scale, w2_scale=layer.w2_scale, w1_zp=None, w2_zp=None diff --git a/vllm/model_executor/layers/quantization/fp8.py 
b/vllm/model_executor/layers/quantization/fp8.py index fac71b956b69..4bc1869e5baa 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any import torch -from torch.nn import Module from torch.utils._python_dispatch import TorchDispatchMode import vllm.model_executor.layers.fused_moe.modular_kernel as mk @@ -21,15 +20,15 @@ vllm_is_batch_invariant, ) from vllm.model_executor.layers.fused_moe import ( - FusedMoE, + FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, + UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( convert_to_fp8_moe_kernel_format, make_fp8_moe_kernel, @@ -189,7 +188,7 @@ def get_quant_method( offline_method = Fp8LinearMethod(self) offline_method.marlin_input_dtype = get_marlin_input_dtype(prefix) return offline_method - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): if is_layer_skipped( prefix=prefix, ignored_layers=self.ignored_layers, @@ -197,9 +196,9 @@ def get_quant_method( ): return UnquantizedFusedMoEMethod(layer.moe_config) if self.is_checkpoint_fp8_serialized: - moe_quant_method = Fp8MoEMethod(self, layer) + moe_quant_method = Fp8MoEMethod(self, layer.moe_config) else: - moe_quant_method = Fp8OnlineMoEMethod(self, layer) + moe_quant_method = Fp8OnlineMoEMethod(self, layer.moe_config) return moe_quant_method elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) @@ -318,7 +317,7 @@ def __init__(self, quant_config: Fp8Config): def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, input_size_per_partition: int, output_partition_sizes: list[int], input_size: int, @@ -380,7 +379,7 @@ def create_weights( set_weight_attrs(scale, {"scale_type": 
"input_scale"}) layer.register_parameter("input_scale", scale) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: if self.use_marlin: # Only Marlin kernels support `marlin_input_dtype`; guard to avoid # AttributeError if backend selection changes. @@ -578,7 +577,7 @@ def patched_weight_loader(param, loaded_weight, *args, **kwargs): layer._load_device = torch.get_default_device() layer.register_parameter("weight", weight) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return @@ -632,8 +631,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): quant_config: The quantization config. """ - def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): - super().__init__(layer.moe_config) + def __init__(self, quant_config: Fp8Config, moe_config: FusedMoEConfig): + super().__init__(moe_config) self.quant_config = quant_config self.weight_block_size = self.quant_config.weight_block_size self.block_quant: bool = self.weight_block_size is not None @@ -663,7 +662,7 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): def create_weights( self, - layer: Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -844,7 +843,7 @@ def _setup_kernel( shared_experts=layer.shared_experts, ) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: # print(f"LAYER {layer}") if getattr(layer, "_already_called_process_weights_after_loading", False): return @@ -905,7 +904,7 @@ def maybe_make_prepare_finalize( "logic. This function should not be called." 
) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: w1_scale = getattr(layer, f"w13_{self.weight_scale_name}") w2_scale = getattr(layer, f"w2_{self.weight_scale_name}") a1_scale = layer.w13_input_scale @@ -938,7 +937,7 @@ def supports_eplb(self) -> bool: def apply_monolithic( self, - layer: torch.nn.Module, # RoutedExperts + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: @@ -961,7 +960,7 @@ def apply_monolithic( def apply( self, - layer: torch.nn.Module, # RoutedExperts + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -995,15 +994,15 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): uses_meta_device: bool = True - def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): - super().__init__(quant_config, layer) + def __init__(self, quant_config: Fp8Config, moe_config: FusedMoEConfig): + super().__init__(quant_config, moe_config) assert not quant_config.is_checkpoint_fp8_serialized assert quant_config.activation_scheme == "dynamic" assert quant_config.weight_block_size is None def create_weights( self, - layer: Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -1159,7 +1158,7 @@ def patched_weight_loader(param, loaded_weight, *args, **kwargs): layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 141f85f81779..a60ee3d82b34 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -21,7 +21,6 @@ 
FusedMoEQuantConfig, ) from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, RoutedExperts, ) @@ -95,7 +94,7 @@ def get_quant_method( ): return UnquantizedEmbeddingMethod() return GGUFEmbeddingMethod(self) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): # TODO: Select UnquantizedFusedMoEMethod on unquantized layers. return GGUFMoEMethod(self, layer.moe_config) return None @@ -566,7 +565,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -627,7 +626,7 @@ def create_weights( layer.register_parameter("w2_qweight_type", w2_qweight_type) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return None diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 154347a930a9..5de721c4c52d 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -12,7 +12,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, @@ -171,7 +171,7 @@ def from_config(cls, config: dict[str, Any]) -> "GPTQConfig": def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Union["GPTQLinearMethod", "QuantizeMethodBase"] | None: - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility from .moe_wna16 import MoeWNA16Config diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py 
b/vllm/model_executor/layers/quantization/gptq_marlin.py index 79326c2410c4..cf050e07311c 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -15,7 +15,6 @@ choose_mp_linear_kernel, ) from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, @@ -66,29 +65,28 @@ def get_moe_quant_method( config: "GPTQMarlinConfig", - layer: torch.nn.Module, + layer: RoutedExperts, prefix: str, moe_method_cls: type, ): cloned_config = deepcopy(config) - if isinstance(layer, FusedMoE): - # False = skip module, None = no override, else = Positive match - if ( - get_dynamic_override( # noqa: E712 - cloned_config, # noqa: E712 - layer_name=prefix, - ) - == False - ): # noqa: E712 - return UnquantizedFusedMoEMethod(layer.moe_config) + assert isinstance(layer, RoutedExperts) + # False = skip module, None = no override, else = Positive match + if ( + get_dynamic_override( # noqa: E712 + cloned_config, # noqa: E712 + layer_name=prefix, + ) + == False + ): # noqa: E712 + return UnquantizedFusedMoEMethod(layer.moe_config) - if prefix: - # Dynamic per module/layer rules may override base config - override_config(cloned_config, prefix=prefix) + if prefix: + # Dynamic per module/layer rules may override base config + override_config(cloned_config, prefix=prefix) - return moe_method_cls(cloned_config, layer.moe_config) - return None + return moe_method_cls(cloned_config, layer.moe_config) class GPTQMarlinConfig(QuantizationConfig): @@ -242,7 +240,7 @@ def override_quantization_method( def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> "QuantizeMethodBase | None": - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config if not check_moe_marlin_supports_layer(layer, self.group_size): @@ -505,7 +503,7 @@ def __init__( def create_weights( self, 
- layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -659,7 +657,7 @@ def create_weights( device = layer.w13_qweight.device layer.workspace = marlin_make_workspace_new(device, 4) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: is_a_8bit = self.input_dtype is not None and self.input_dtype.itemsize == 1 if is_a_8bit: @@ -789,7 +787,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_bias.data = marlin_permute_bias(layer.w2_bias) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: from vllm.model_executor.layers.fused_moe.config import ( gptq_marlin_moe_quant_config, @@ -813,7 +811,7 @@ def get_fused_moe_quant_config( def select_gemm_impl( self, prepare_finalize, - layer: torch.nn.Module, + layer: RoutedExperts, ): """ Select the GEMM implementation for GPTQ-Marlin MoE. 
diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py index 359f24688ce9..6b95ccb3badc 100644 --- a/vllm/model_executor/layers/quantization/inc.py +++ b/vllm/model_executor/layers/quantization/inc.py @@ -8,6 +8,7 @@ import torch from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod from vllm.model_executor.layers.quantization import ( QuantizationConfig, @@ -224,7 +225,6 @@ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): self.extra_config = hf_to_vllm_mapper.apply_dict(self.extra_config) def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): - from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, check_moe_marlin_supports_layer, @@ -254,7 +254,7 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): AWQ_TYPE_MAP[weight_bits], group_size, not sym ) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): use_marlin = use_marlin and check_moe_marlin_supports_layer( layer, group_size ) @@ -288,7 +288,7 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): zero_point=not sym, ) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): if use_marlin: return AWQMarlinMoEMethod(quant_args_marlin, layer.moe_config) from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config @@ -310,7 +310,6 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): return None def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"): - from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, check_moe_marlin_supports_layer, @@ 
-339,7 +338,7 @@ def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"): use_marlin = (weight_bits, sym) in GPTQ_TYPE_MAP and check_marlin_supported( GPTQ_TYPE_MAP[(weight_bits, sym)], group_size, has_zp=not sym ) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): use_marlin = use_marlin and check_moe_marlin_supports_layer( layer, group_size ) @@ -375,7 +374,7 @@ def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"): dynamic={}, ) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): if use_marlin: return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config) else: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index b4a53c8c3110..844624a357e7 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -12,7 +12,6 @@ from vllm.model_executor.kernels.linear import init_fp8_linear_kernel from vllm.model_executor.layers.attention import Attention, MLAAttention from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoeWeightScaleSupported, RoutedExperts, ) @@ -208,7 +207,7 @@ def get_quant_method( if getattr(quant_method, "backend", "") == "marlin": quant_method.marlin_input_dtype = get_marlin_input_dtype(prefix) return quant_method - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): quant_method = self.FusedMoEMethodCls( quant_config=self, moe_config=layer.moe_config ) @@ -757,7 +756,7 @@ def maybe_make_prepare_finalize( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: raise ValueError( f"{self.__class__.__name__} uses the new modular kernel initialization " @@ -766,7 +765,7 @@ def select_gemm_impl( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, 
hidden_size: int, intermediate_size_per_partition: int, @@ -883,7 +882,7 @@ def _setup_kernel( shared_experts=layer.shared_experts, ) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: w13 = layer.w13_weight w2 = layer.w2_weight w13_scale = layer.w13_weight_scale @@ -914,7 +913,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer, w13, w2, w13_scale, w2_scale, w13_input_scale, w2_input_scale ) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: w1_scale = layer.w13_weight_scale w2_scale = layer.w2_weight_scale a1_scale = layer.w13_input_scale @@ -1224,7 +1223,7 @@ def uses_weight_scale_2_pattern(self) -> bool: def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -1397,7 +1396,7 @@ def process_weights_after_loading(self, layer: RoutedExperts) -> None: ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: return make_nvfp4_moe_quant_config( backend=self.nvfp4_backend, w13_scale=layer.w13_weight_scale, @@ -1717,7 +1716,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -1908,7 +1907,7 @@ def _shuffle_weights_for_trtllm(self, layer: torch.nn.Module) -> None: torch.stack(w2_scale_shuffled).contiguous(), ) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): 
return @@ -1928,7 +1927,7 @@ def maybe_make_prepare_finalize( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: raise ValueError( f"{self.__class__.__name__} uses the new modular kernel initialization " @@ -1936,7 +1935,7 @@ def select_gemm_impl( ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: # TRTLLM MXFP8 path is monolithic and does not use modular kernel config. return None @@ -2151,7 +2150,7 @@ def _resolve_quant_algo(self, prefix: str) -> str | None: Tries three strategies in order: 1. Direct lookup in ``quantized_layers``. 2. Packed/fused-layer lookup (unfuse via ``packed_modules_mapping``). - 3. Prefix-based lookup for FusedMoE (any child key starts with + 3. Prefix-based lookup for RoutedExperts (any child key starts with ``prefix + "."``). Returns the upper-cased quant_algo string, or *None* if the prefix @@ -2178,7 +2177,7 @@ def _resolve_quant_algo(self, prefix: str) -> str | None: f"{algos}. All shards must use the same quantization." ) - # 3. Prefix-based lookup (for FusedMoE / parent modules) + # 3. Prefix-based lookup (for RoutedExperts / parent modules) prefix_dot = prefix + "." 
for key, info in self.quantized_layers.items(): if key.startswith(prefix_dot): @@ -2212,7 +2211,7 @@ def get_quant_method( # Layer not in quantized_layers — leave unquantized return UnquantizedLinearMethod() - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): if quant_algo == "FP8": return ModelOptFp8MoEMethod( quant_config=self.fp8_config, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index d1073d23b3bb..03de2361ceb5 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -7,7 +7,6 @@ from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, @@ -167,7 +166,7 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> "QuantizeMethodBase | None": if is_layer_skipped_quant(prefix, self.modules_to_not_convert): - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): return UnquantizedFusedMoEMethod(layer.moe_config) return UnquantizedLinearMethod() elif isinstance(layer, LinearBase): @@ -203,7 +202,7 @@ def get_quant_method( ) else: raise ValueError("moe_wna16 only support gptq and awq.") - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): return MoeWNA16Method(self, layer.moe_config) return None @@ -225,7 +224,7 @@ def __init__(self, quant_config: MoeWNA16Config, moe: "FusedMoEConfig") -> None: def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -344,7 +343,7 @@ def create_weights( set_weight_attrs(param, extra_weight_attrs) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: weight_bits = self.quant_config.weight_bits has_zp = 
self.quant_config.has_zp diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 9e7c22f57f41..55fb1ede2258 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -7,7 +7,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEConfig, FusedMoEMethodBase, RoutedExperts, @@ -78,7 +77,7 @@ def get_quant_method( scope="local", ) return UnquantizedLinearMethod() - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): return Mxfp4MoEMethod(layer.moe_config) elif isinstance(layer, Attention): logger.debug_once( @@ -132,7 +131,7 @@ def skip_forward_padding(self) -> bool: def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -330,7 +329,7 @@ def _setup_kernel( shared_experts=layer.shared_experts, ) - def process_weights_after_loading(self, layer): + def process_weights_after_loading(self, layer: RoutedExperts) -> None: w13 = layer.w13_weight w2 = layer.w2_weight w13_scale = layer.w13_weight_scale @@ -344,7 +343,7 @@ def process_weights_after_loading(self, layer): self._setup_kernel(layer, w13, w2, w13_scale, w2_scale, w13_bias, w2_bias) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: w1_scale = layer.w13_weight_scale w2_scale = layer.w2_weight_scale @@ -368,7 +367,7 @@ def get_fused_moe_quant_config( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: raise ValueError( f"{self.__class__.__name__} uses the new modular kernel " diff --git a/vllm/model_executor/layers/quantization/mxfp8.py b/vllm/model_executor/layers/quantization/mxfp8.py 
index 5b4564bea31c..ab3b797586a4 100644 --- a/vllm/model_executor/layers/quantization/mxfp8.py +++ b/vllm/model_executor/layers/quantization/mxfp8.py @@ -6,15 +6,15 @@ from typing import Any import torch -from torch.nn import Module from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import ( - FusedMoE, + FusedMoEConfig, FusedMoEMethodBase, + RoutedExperts, + UnquantizedFusedMoEMethod, ) -from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe.oracle.mxfp8 import ( select_mxfp8_moe_backend, ) @@ -105,7 +105,7 @@ def get_quant_method( ): return UnquantizedLinearMethod() return Mxfp8OnlineLinearMethod(self) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): if is_layer_skipped( prefix=prefix, ignored_layers=self.ignored_layers, @@ -113,7 +113,7 @@ def get_quant_method( skip_with_substr=True, ): return UnquantizedFusedMoEMethod(layer.moe_config) - return Mxfp8OnlineMoEMethod(self, layer) + return Mxfp8OnlineMoEMethod(self, layer.moe_config) elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) return None @@ -179,7 +179,7 @@ def create_weights( **extra_weight_attrs, ) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return @@ -226,8 +226,8 @@ class Mxfp8OnlineMoEMethod(Fp8OnlineMoEMethod): uses_meta_device: bool = True - def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): - FusedMoEMethodBase.__init__(self, layer.moe_config) + def __init__(self, quant_config: Fp8Config, moe_config: FusedMoEConfig): + FusedMoEMethodBase.__init__(self, moe_config) self.quant_config = quant_config assert not quant_config.is_checkpoint_fp8_serialized assert quant_config.activation_scheme == "dynamic" @@ -240,7 +240,7 @@ 
def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): def create_weights( self, - layer: Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -305,7 +305,7 @@ def _quantize_mxfp8_moe_weight( return torch.stack(w_quant), torch.stack(w_scales) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 78c64bac6187..04562f895d5f 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -8,7 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -147,7 +147,7 @@ def get_quant_method( if isinstance(layer, Attention): return QuarkKVCacheMethod(self) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix) return None diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index b1c29cccad93..4ebf30a5260b 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -68,7 +68,7 @@ def __init__(self, moe: FusedMoEConfig): @staticmethod def get_moe_method( quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821 - module: torch.nn.Module, + module: RoutedExperts, layer_name: str, ) -> "QuarkMoEMethod": layer_quant_config = 
quant_config._find_matched_config(layer_name, module) @@ -166,7 +166,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -291,7 +291,7 @@ def create_weights( else: layer.w13_bias, layer.w2_bias = None, None - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: # Fp8 moe kernels require a single activation scale. # We take the max of all the scales in case they differ. if self.static_input_scales: @@ -426,7 +426,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return fp8_w8a8_moe_quant_config( w1_scale=layer.w13_weight_scale, @@ -518,7 +518,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -585,7 +585,7 @@ def create_weights( set_weight_attrs(w13_weight_scale_2, extra_weight_attrs) set_weight_attrs(w2_weight_scale_2, extra_weight_attrs) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight.data, layer.w2_weight.data ) @@ -788,7 +788,7 @@ def get_packed_dim(self, dim: int, quant_dtype: str): def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -911,7 +911,7 @@ def create_weights( layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer): + def process_weights_after_loading(self, layer: RoutedExperts) -> None: if self.static_input_scales and self.input_dtype == "fp8": # 
firstly, process activations if fp8 static input if layer.w13_input_scale is None or layer.w2_input_scale is None: @@ -999,7 +999,7 @@ def process_weights_after_loading(self, layer): torch.accelerator.empty_cache() def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: if self.ocp_mx_scheme == "w_mxfp4": return mxfp4_w4a16_moe_quant_config( @@ -1086,7 +1086,7 @@ def __init__( ): super().__init__(weight_config, input_config, moe) - def process_weights_after_loading(self, layer): + def process_weights_after_loading(self, layer: RoutedExperts) -> None: from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig w13_bias = layer.w13_bias.to(torch.float32) @@ -1157,7 +1157,7 @@ def process_weights_after_loading(self, layer): ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return mxfp4_w4a8_moe_quant_config( w1_scale=self.w13_precision_config, @@ -1175,7 +1175,7 @@ def is_monolithic(self) -> bool: def apply_monolithic( self, - layer: torch.nn.Module, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, expert_map: torch.Tensor | None = None, diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index d659effd70ff..39e1083a81dd 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -8,6 +8,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.int8_utils import ( @@ -226,7 +227,7 @@ def check_marlin_supports_layer(layer: LinearBase, 
group_size: int) -> bool: )[0] -def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool: +def check_moe_marlin_supports_layer(layer: RoutedExperts, group_size: int) -> bool: if current_platform.is_rocm(): return False hidden_size = layer.hidden_size @@ -471,7 +472,7 @@ def get__quant_fp8_method() -> QuantFP8: return _quant_fp8_method -def get_marlin_input_dtype(prefix: str | None = None): +def get_marlin_input_dtype(prefix: str | None = None): # ? if envs.VLLM_MARLIN_INPUT_DTYPE is None: return elif envs.VLLM_MARLIN_INPUT_DTYPE.lower() == "int8": From 54f88b20607ec96c8024fcd62f0b87eb7e086bff Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 28 Mar 2026 01:16:55 +0000 Subject: [PATCH 091/191] fix? weight loader Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 4 +- .../layers/fused_moe/eplb_manager.py | 63 +---------- .../layers/fused_moe/expert_map_manager.py | 7 +- vllm/model_executor/layers/fused_moe/layer.py | 101 +++++++++--------- .../layers/fused_moe/modular_kernel.py | 1 + .../layers/fused_moe/routed_experts.py | 73 +++++++++++-- .../model_loader/bitsandbytes_loader.py | 10 +- .../model_loader/ep_weight_filter.py | 2 +- vllm/model_executor/models/bailing_moe.py | 2 +- .../models/bailing_moe_linear.py | 2 +- vllm/model_executor/models/ernie45_vl_moe.py | 2 +- vllm/model_executor/models/exaone_moe.py | 2 +- vllm/model_executor/models/exaone_moe_mtp.py | 2 +- vllm/model_executor/models/gpt_oss.py | 2 +- vllm/model_executor/models/hunyuan_v1.py | 2 +- vllm/model_executor/models/olmoe.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 2 +- vllm/model_executor/models/qwen3_5.py | 2 +- vllm/model_executor/models/qwen3_5_mtp.py | 2 +- vllm/model_executor/models/qwen3_moe.py | 2 +- vllm/model_executor/models/qwen3_next.py | 2 +- vllm/model_executor/models/qwen3_next_mtp.py | 2 +- vllm/model_executor/models/qwen3_vl_moe.py | 2 +- vllm/model_executor/models/sarvam.py | 2 +- 24 files changed, 142 insertions(+), 151 deletions(-) diff 
--git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 19780c522469..63dc86a677a4 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -115,7 +115,7 @@ def _get_lora_moe_configs( top_k=top_k, dtype=config_dtype, M=M, - block_shape=layer.quant_method.moe_quant_config.block_shape, + block_shape=layer.routed_experts.quant_method.moe_quant_config.block_shape, ) shrink_config = get_config_func( op_type=f"fused_moe_lora_{op_prefix}_shrink" @@ -132,7 +132,7 @@ def _inject_lora_into_fused_moe(self): top_k = self.base_layer.top_k self.base_layer.ensure_moe_quant_config_init() - quant_config = self.base_layer.quant_method.moe_quant_config + quant_config = self.base_layer.routed_experts.quant_method.moe_quant_config if getattr(self.base_layer.quant_method, "supports_internal_mk", False): # Use the existing modular kernel from the quant method diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index 6d43554ba7ea..3e86465aa320 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -13,7 +13,7 @@ import torch -from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState +from vllm.distributed.eplb.eplb_state import EplbLayerState class EplbManager: @@ -153,67 +153,6 @@ def _maybe_make_contiguous( and not name.startswith("_runner.routed_output_transform.") ] - @staticmethod - def make_expert_params_mapping( - model: torch.nn.Module, - ckpt_gate_proj_name: str, - ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int, - num_redundant_experts: int = 0, - ) -> list[tuple[str, str, int, str]]: - """ - Create expert parameter mapping for weight loading with redundant experts. 
- - In the returned mapping: - - `expert_id` is the physical expert id - - `weight_name` contains the weight name of the logical expert - So that we map the expert id to logical in `weight_name` - - Args: - model: The model containing the MoE layer - ckpt_gate_proj_name: Checkpoint parameter name for gate projection - ckpt_down_proj_name: Checkpoint parameter name for down projection - ckpt_up_proj_name: Checkpoint parameter name for up projection - num_experts: Number of logical experts - num_redundant_experts: Number of redundant experts for EPLB - - Returns: - List of (param_name, weight_name, expert_id, shard_id) tuples - """ - num_physical_experts = num_experts + num_redundant_experts - - # Build initial physical-to-logical mapping - physical_to_logical_map = ( - EplbState.build_initial_global_physical_to_logical_map( - num_experts, num_redundant_experts - ) - ) - - base_layer = ( - "base_layer." - if any(".base_layer." in name for name, _ in model.named_parameters()) - else "" - ) - - return [ - # (param_name, weight_name, expert_id, shard_id) - ( - f"experts.{base_layer}w13_" - if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] - else f"experts.{base_layer}w2_", - f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", - expert_id, - shard_id, - ) - for expert_id in range(num_physical_experts) - for shard_id, weight_name in [ - ("w1", ckpt_gate_proj_name), - ("w2", ckpt_down_proj_name), - ("w3", ckpt_up_proj_name), - ] - ] - @staticmethod def validate_configuration( global_num_experts: int, diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 54ca501a125d..3c5dfcd55d2b 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -235,12 +235,7 @@ def map_global_to_local(self, global_id: int) -> int: if self._expert_map is None: return global_id - local_id = 
self._expert_map[global_id].item() - if local_id == -1: - raise ValueError( - f"Expert {global_id} is not assigned to rank {self.ep_rank}" - ) - return local_id + return self._expert_map[global_id].item() def is_local_expert(self, global_id: int) -> bool: """Check if expert is assigned to this rank.""" diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 95a647faa3a8..fc00aa9d9bb2 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable, Iterable -from typing import Literal, overload import torch @@ -136,7 +135,7 @@ def maybe_roundup_hidden_size( def register_layer_for_moe_forward_op( vllm_config: VllmConfig, - layer: torch.nn.Module, + layer: torch.nn.Module, # FusedMoE for now ): # For smuggling this layer into the fused moe custom op prefix = layer.layer_name @@ -502,7 +501,7 @@ def __init__( ) # HACK - self.quant_method = self.routed_experts.quant_method + # self.quant_method = self.routed_experts.quant_method # Move XXXXXXXXXXXXX if eplb_manager is not None and not self.quant_method.supports_eplb: @@ -761,7 +760,7 @@ def make_expert_params_mapping( num_redundant_experts: int = 0, ) -> list[tuple[str, str, int, str]]: """Delegate to EPLB manager.""" - return EplbManager.make_expert_params_mapping( + return RoutedExperts.make_expert_params_mapping( model, ckpt_gate_proj_name, ckpt_down_proj_name, @@ -774,52 +773,52 @@ def make_expert_params_mapping( # Weight Loading (Delegated to RoutedExperts) # - @overload - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: Literal[False], - ) -> None: ... 
- - @overload - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: Literal[True], - ) -> bool: ... - - def weight_loader( - self, - param: torch.nn.Parameter, - loaded_weight: torch.Tensor, - weight_name: str, - shard_id: str, - expert_id: int, - return_success: bool = False, - ) -> bool | None: - """Delegate to RoutedExperts.""" - return self.routed_experts.weight_loader( - param=param, - loaded_weight=loaded_weight, - weight_name=weight_name, - shard_id=shard_id, - expert_id=expert_id, - return_success=return_success, - ) - - def load_weights( - self, weights: Iterable[tuple[str, torch.Tensor]] - ) -> Iterable[str]: - """Delegate to RoutedExperts.""" - return self.routed_experts.load_weights(weights) + # @overload + # def weight_loader( + # self, + # param: torch.nn.Parameter, + # loaded_weight: torch.Tensor, + # weight_name: str, + # shard_id: str, + # expert_id: int, + # return_success: Literal[False], + # ) -> None: ... + + # @overload + # def weight_loader( + # self, + # param: torch.nn.Parameter, + # loaded_weight: torch.Tensor, + # weight_name: str, + # shard_id: str, + # expert_id: int, + # return_success: Literal[True], + # ) -> bool: ... 
+ + # def weight_loader( + # self, + # param: torch.nn.Parameter, + # loaded_weight: torch.Tensor, + # weight_name: str, + # shard_id: str, + # expert_id: int, + # return_success: bool = False, + # ) -> bool | None: + # """Delegate to RoutedExperts.""" + # return self.routed_experts.weight_loader( + # param=param, + # loaded_weight=loaded_weight, + # weight_name=weight_name, + # shard_id=shard_id, + # expert_id=expert_id, + # return_success=return_success, + # ) + + # def load_weights( + # self, weights: Iterable[tuple[str, torch.Tensor]] + # ) -> Iterable[str]: + # """Delegate to RoutedExperts.""" + # return self.routed_experts.load_weights(weights) # # Execution @@ -845,4 +844,4 @@ def forward_cuda( # Mark the FusedMoE weight_loader as supporting MoE-specific parameters # to avoid expensive runtime reflection in model loading code -FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] +# FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 84eafde51d78..ef467328db99 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -493,6 +493,7 @@ def __init__( self.max_num_tokens = max_num_tokens self.num_dispatchers = num_dispatchers + # RoutedExperts def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # noqa: B027 pass diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index d8b427276576..02579ada9a88 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -12,6 +12,7 @@ from vllm.distributed import ( get_dp_group, ) +from vllm.distributed.eplb.eplb_state import EplbState from vllm.logger import init_logger from 
vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -47,9 +48,6 @@ class RoutedExperts(torch.nn.Module): and handles: - Loading checkpoint weights into parameters - Executing routed experts via quant_method.apply() - - Weight parameters are registered on this module via _ParameterRegistrationWrapper - during FusedMoE initialization. """ def __init__( @@ -412,9 +410,7 @@ def weight_loader( quant_method_name = self.quant_method.__class__.__name__ global_expert_id = expert_id - expert_id = self.layer._map_global_expert_id_to_local_expert_id( - global_expert_id - ) + expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id) use_global_sf = ( getattr(self.quant_method, "use_global_sf", False) @@ -664,9 +660,9 @@ def weight_loader( def load_weights( self, weights: Iterable[tuple[str, torch.Tensor]] ) -> Iterable[str]: - if (expert_mapping := self.layer.expert_mapping) is None: + if (expert_mapping := self.expert_mapping) is None: raise ValueError( - "`self.layer.expert_mapping` must be provided to " + "`self.expert_mapping` must be provided to " "load weights using `self.load_weights`." ) for expert_name, loaded_weight in weights: @@ -713,6 +709,67 @@ def load_weights( ) yield param_name + @staticmethod + def make_expert_params_mapping( + model: torch.nn.Module, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + num_redundant_experts: int = 0, + ) -> list[tuple[str, str, int, str]]: + """ + Create expert parameter mapping for weight loading with redundant experts. 
+ + In the returned mapping: + - `expert_id` is the physical expert id + - `weight_name` contains the weight name of the logical expert + So that we map the expert id to logical in `weight_name` + + Args: + model: The model containing the MoE layer + ckpt_gate_proj_name: Checkpoint parameter name for gate projection + ckpt_down_proj_name: Checkpoint parameter name for down projection + ckpt_up_proj_name: Checkpoint parameter name for up projection + num_experts: Number of logical experts + num_redundant_experts: Number of redundant experts for EPLB + + Returns: + List of (param_name, weight_name, expert_id, shard_id) tuples + """ + num_physical_experts = num_experts + num_redundant_experts + + # Build initial physical-to-logical mapping + physical_to_logical_map = ( + EplbState.build_initial_global_physical_to_logical_map( + num_experts, num_redundant_experts + ) + ) + + base_layer = ( + "base_layer." + if any(".base_layer." in name for name, _ in model.named_parameters()) + else "" + ) + + return [ + # (param_name, weight_name, expert_id, shard_id) + ( + f".experts.routed_experts.{base_layer}w13_" + if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] + else f".experts.routed_experts.{base_layer}w2_", + f".experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", + expert_id, + shard_id, + ) + for expert_id in range(num_physical_experts) + for shard_id, weight_name in [ + ("w1", ckpt_gate_proj_name), + ("w2", ckpt_down_proj_name), + ("w3", ckpt_up_proj_name), + ] + ] + # # Execution # diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 81526415ff2d..0fbf4db2ebd9 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -23,7 +23,7 @@ ) from vllm.logger import init_logger from vllm.lora.utils import is_moe_model -from vllm.model_executor.layers.fused_moe import FusedMoE +from 
vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, MergedColumnParallelLinear, @@ -465,13 +465,13 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: self.target_modules.append(name) if module.disable_tp: self.tp_disabled_modules.append(name) - elif isinstance(module, FusedMoE) and hasattr( + elif isinstance(module, RoutedExperts) and hasattr( module.quant_method, "quant_config" ): # TODO: support FusedMoE with prequant and 8bit. if self.pre_quant and self.load_8bit: raise ValueError( - "Prequant BitsAndBytes 8bit models with FusedMoE " + "Prequant BitsAndBytes 8bit models with RoutedExperts " "is not supported yet." ) # Get the corresponding weight name using module name and @@ -509,7 +509,7 @@ def _classify_module_sharding(self, model: nn.Module): # dimension (dim=-1) elif isinstance(module, (RowParallelLinear,)): self.column_sharded_weights_modules.append(name) - elif isinstance(module, FusedMoE): + elif isinstance(module, RoutedExperts): expert_mapping = self.expert_params_mapping for exp in expert_mapping: if exp[-1] == "w2": @@ -630,7 +630,7 @@ def _fuse_moe_quant_states(self, model: nn.Module, quant_states_dict: dict) -> d expert_mapping = self.expert_params_mapping expert_qs_dict = {} for name, module in model.named_modules(): - if not isinstance(module, FusedMoE): + if not isinstance(module, RoutedExperts): continue w1_states_lst = [] w2_states_lst = [] diff --git a/vllm/model_executor/model_loader/ep_weight_filter.py b/vllm/model_executor/model_loader/ep_weight_filter.py index 190842379253..48bfacc6ee05 100644 --- a/vllm/model_executor/model_loader/ep_weight_filter.py +++ b/vllm/model_executor/model_loader/ep_weight_filter.py @@ -13,7 +13,7 @@ # Matches per-expert weight names like ".experts.42.gate_proj.weight". 
# Does NOT match 3D fused-expert names like ".experts.gate_proj.weight" # (no numeric id) — those are intentionally left unfiltered so the full -# tensor is loaded and sliced later by FusedMoE.weight_loader. +# tensor is loaded and sliced later by RoutedExperts.weight_loader. _EXPERT_ID_RE = re.compile(r"\.experts\.(\d+)\.") diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 510d605f8046..93884a6a05ff 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -490,7 +490,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py index 8769e519702a..b8bc34325190 100644 --- a/vllm/model_executor/models/bailing_moe_linear.py +++ b/vllm/model_executor/models/bailing_moe_linear.py @@ -1104,7 +1104,7 @@ def normalize_name(name: str) -> str | None: continue # Handle expert weights - if "mlp.experts" in norm_name: + if "mlp.experts" in norm_name: # XXXXXXXXXXXXXXXXXXXX # Expert bias if ( "mlp.experts.e_score_correction_bias" in norm_name diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index e4b7ac6fb006..fa70f1c7acf0 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -690,7 +690,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: break else: # Distinguish between vision experts and text experts - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXX moe_offset = int(name.split(".")[-3]) 
vision_expert_start_idx = self.config.moe_num_experts[0] is_text_expert = moe_offset <= vision_expert_start_idx - 1 diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py index a46cadf007ee..40b12d66e4d2 100644 --- a/vllm/model_executor/models/exaone_moe.py +++ b/vllm/model_executor/models/exaone_moe.py @@ -387,7 +387,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/exaone_moe_mtp.py b/vllm/model_executor/models/exaone_moe_mtp.py index b3c71e6aef6e..e84ddc8c9d4e 100644 --- a/vllm/model_executor/models/exaone_moe_mtp.py +++ b/vllm/model_executor/models/exaone_moe_mtp.py @@ -149,7 +149,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 083defc9c2db..bd43d35c3dad 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -668,7 +668,7 @@ def kv_cache_scale_loader( continue if ( - all(key in name for key in ["input_scale", "mlp.experts"]) + all(key in name for key in ["input_scale", "mlp.experts"]) # XXXXX and expert_id is not None ): assert loaded_weight.numel() == 1 diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 35d30006a66a..b5967fdebab6 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -782,7 +782,7 @@ def load_weights(self, weights: 
Iterable[tuple[str, torch.Tensor]]): for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXX continue # cross layer only have q_proj, skip qkv pack if weight_name == ".q_proj": diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index fcde2e41afbb..fe41c584ac2e 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -366,7 +366,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 1234aad77be2..be28d3772d11 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -447,7 +447,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. 
diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index daca52821e0f..b22fdc5902bc 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -485,7 +485,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py index 0eca47492c91..0f74b913ad05 100644 --- a/vllm/model_executor/models/qwen3_5_mtp.py +++ b/vllm/model_executor/models/qwen3_5_mtp.py @@ -217,7 +217,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f0f69d435379..b62c765201ce 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -568,7 +568,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for mlp.experts[0].gate_gate_up_proj, which breaks load. 
- if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 7e0544fda88d..787a6e749f60 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -1458,7 +1458,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index 751d7c23eb97..f3011f604427 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -163,7 +163,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index a9c01ccf5959..df13347e2b9b 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -208,7 +208,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXX continue name = name.replace(weight_name, param_name) # Skip loading extra parameters for GPTQ/modelopt models. 
diff --git a/vllm/model_executor/models/sarvam.py b/vllm/model_executor/models/sarvam.py index fa5ec44d7e72..5544e0fe4cbd 100644 --- a/vllm/model_executor/models/sarvam.py +++ b/vllm/model_executor/models/sarvam.py @@ -570,7 +570,7 @@ def load_weights( for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if "mlp.experts" in name: + if "mlp.experts" in name: # XXXXXXXXXXXXXXXXX continue new_name = name.replace(weight_name, param_name) if new_name.endswith(".bias") and new_name not in params_dict: From 1f385afdd4d80a6bf1f4db904e4c3138243867f6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 28 Mar 2026 01:35:27 +0000 Subject: [PATCH 092/191] fix test Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 4 +++- vllm/model_executor/layers/fused_moe/layer.py | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index 13b023c9c855..193f72e2f5ab 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -888,7 +888,9 @@ def make_fused_moe_layer( name, torch.nn.Parameter(value, requires_grad=False) ) - layer.quant_method.process_weights_after_loading(layer.routed_experts) + layer.routed_experts.quant_method.process_weights_after_loading( + layer.routed_experts + ) # Temporary hack until #36286 or #36732 lands if quantization is None: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fc00aa9d9bb2..b62a8d9210dd 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -504,7 +504,10 @@ def __init__( # self.quant_method = self.routed_experts.quant_method # Move XXXXXXXXXXXXX - if eplb_manager is not None and not self.quant_method.supports_eplb: + if ( + eplb_manager is not None + and not self.routed_experts.quant_method.supports_eplb + ): # TODO: Add support for 
additional quantization methods. # The implementation for other quantization methods does not # contain essential differences, but the current quant API From f0ceac41f91da06878fff576a4e8ae9a54f4bf4d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 28 Mar 2026 01:56:31 +0000 Subject: [PATCH 093/191] another fix Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/eplb_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index 3e86465aa320..8b046073f1e4 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -133,6 +133,7 @@ def _maybe_make_contiguous( for name, weight in weights if not ( name.startswith("_runner._shared_experts._layer") + or name.startswith("routed_experts.shared_experts._layer") or name.startswith("_runner.gate.") or name.startswith("_runner.routed_input_transform.") or name.startswith("_runner.routed_output_transform.") @@ -146,6 +147,7 @@ def _maybe_make_contiguous( if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) and not name.startswith("_runner._shared_experts._layer") + and not name.startswith("routed_experts.shared_experts._layer") # exclude parameters from non-expert submodules, # e.g. gate/shared/transforms. and not name.startswith("_runner.gate.") From dddfd908d44fd8d941e6cce3b708efafea8c42a2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 28 Mar 2026 04:21:05 +0000 Subject: [PATCH 094/191] incomplete lora fixes. 
runner/router cleanups Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 71 ++++++------ vllm/lora/layers/utils.py | 3 + vllm/lora/lora_model.py | 1 + vllm/lora/model_manager.py | 1 + vllm/model_executor/layers/fused_moe/layer.py | 101 +++++------------- .../layers/fused_moe/routed_experts.py | 6 +- .../fused_moe/runner/default_moe_runner.py | 3 +- .../layers/fused_moe/runner/moe_runner.py | 5 - .../fused_moe/runner/moe_runner_base.py | 10 +- vllm/model_executor/models/gpt_oss.py | 1 + .../model_executor/warmup/deep_gemm_warmup.py | 7 +- 11 files changed, 84 insertions(+), 125 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 63dc86a677a4..2b9bb4c584fe 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -15,7 +15,7 @@ from vllm.distributed.utils import divide from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.lora.ops.triton_ops.utils import get_lora_op_configs -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, RoutedExperts from vllm.model_executor.layers.fused_moe.config import ( _get_config_dtype_str, ) @@ -45,8 +45,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: FusedMoE) -> None: super().__init__() self.base_layer = base_layer + self._runner = base_layer._runner - assert not self.base_layer.use_ep, ( + assert not self.routed_experts.use_ep, ( "EP support for Fused MoE LoRA is not implemented yet." 
) self.tp_size = get_tensor_model_parallel_world_size() @@ -57,6 +58,10 @@ def __init__(self, base_layer: FusedMoE) -> None: self._w13_slices = 2 if base_layer.moe_config.is_act_and_mul else 1 self._inject_lora_into_fused_moe() + @property + def routed_experts(self) -> RoutedExperts: + return self.base_layer.routed_experts + def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]: normalized_config = {} for key, value in config.items(): @@ -129,14 +134,14 @@ def _get_lora_moe_configs( def _inject_lora_into_fused_moe(self): moe_state_dict = {} - top_k = self.base_layer.top_k + top_k = self.routed_experts.top_k - self.base_layer.ensure_moe_quant_config_init() - quant_config = self.base_layer.routed_experts.quant_method.moe_quant_config + self.routed_experts._ensure_moe_quant_config_init() + quant_config = self.routed_experts.quant_method.moe_quant_config - if getattr(self.base_layer.quant_method, "supports_internal_mk", False): + if getattr(self.routed_experts.quant_method, "supports_internal_mk", False): # Use the existing modular kernel from the quant method - m_fused_moe_fn = self.base_layer.quant_method.moe_kernel + m_fused_moe_fn = self.routed_experts.quant_method.moe_kernel # Don't let the kernel own shared experts so the runner can # overlap them with routed experts via a separate CUDA stream. 
m_fused_moe_fn.shared_experts = None @@ -147,8 +152,8 @@ def _inject_lora_into_fused_moe(self): prepare_finalize = MoEPrepareAndFinalizeNoDPEPModular() m_fused_moe_fn = FusedMoEKernel( prepare_finalize, - self.base_layer.quant_method.select_gemm_impl( - prepare_finalize, self.base_layer + self.routed_experts.quant_method.select_gemm_impl( + prepare_finalize, self.routed_experts ), ) @@ -212,7 +217,7 @@ def wrapper(*args, **kwargs): naive_block_assignment = ( expert_map is None and num_tokens * top_k * SPARSITY_FACTOR - <= self.base_layer.local_num_experts * self.max_loras + <= self.routed_experts.local_num_experts * self.max_loras ) # get the block size of m from customized config or default config @@ -225,7 +230,7 @@ def wrapper(*args, **kwargs): curr_topk_ids, num_tokens, shrink_config["BLOCK_SIZE_M"], - self.base_layer.local_num_experts, + self.routed_experts.local_num_experts, self.max_loras, self.adapter_enabled, expert_map, @@ -311,7 +316,7 @@ def wrapper(*args, **kwargs): intermediate_cache2 = moe_state_dict["intermediate_cache2"] intermediate_cache3 = args[0] - shard_size_w2 = divide(self.base_layer.hidden_size, self.tp_size) + shard_size_w2 = divide(self.routed_experts.hidden_size, self.tp_size) self.punica_wrapper.add_lora_fused_moe( intermediate_cache3, @@ -350,7 +355,7 @@ def wrapper(*args, **kwargs): ) # TODO(bnell): find a less intrusive way to handle this. 
self.base_layer._replace_quant_method( - FusedMoEModularMethod(self.base_layer.quant_method, m_fused_moe_fn) + FusedMoEModularMethod(self.routed_experts.quant_method, m_fused_moe_fn) ) def _create_lora_a_weights( @@ -362,11 +367,11 @@ def _create_lora_a_weights( torch.zeros( ( max_loras, - self.base_layer.local_num_experts, + self.routed_experts.local_num_experts, lora_config.max_lora_rank if not self.fully_sharded else divide(lora_config.max_lora_rank, self.tp_size), - self.base_layer.hidden_size, + self.routed_experts.hidden_size, ), dtype=lora_config.lora_dtype, device=self.device, @@ -377,9 +382,9 @@ def _create_lora_a_weights( torch.zeros( ( max_loras, - self.base_layer.local_num_experts, + self.routed_experts.local_num_experts, lora_config.max_lora_rank, - self.base_layer.intermediate_size_per_partition, + self.routed_experts.intermediate_size_per_partition, ), dtype=lora_config.lora_dtype, device=self.device, @@ -391,8 +396,8 @@ def _create_lora_b_weights(self, max_loras: int, lora_config: LoRAConfig): torch.zeros( ( max_loras, - self.base_layer.local_num_experts, - self.base_layer.intermediate_size_per_partition, + self.routed_experts.local_num_experts, + self.routed_experts.intermediate_size_per_partition, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -404,10 +409,10 @@ def _create_lora_b_weights(self, max_loras: int, lora_config: LoRAConfig): torch.zeros( ( max_loras, - self.base_layer.local_num_experts, - self.base_layer.hidden_size + self.routed_experts.local_num_experts, + self.routed_experts.hidden_size if not self.fully_sharded - else divide(self.base_layer.hidden_size, self.tp_size), + else divide(self.routed_experts.hidden_size, self.tp_size), lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -437,7 +442,7 @@ def create_lora_weights( self.lora_a_stacked = [] self.lora_b_stacked = [] for lora_id in range(max_loras): - for experts_id in range(self.base_layer.local_num_experts): + for experts_id in 
range(self.routed_experts.local_num_experts): # For gated MoE: gate_proj (w1), down_proj (w2), up_proj (w3) # For non-gated MoE: up_proj (w1), down_proj (w2) self.lora_a_stacked.append( @@ -484,7 +489,7 @@ def _slice_w13_b(self, w13_lora_b: torch.Tensor): return w13_lora_b # w13_lora_b shape (num_experts,output_size,rank) - shard_size = self.base_layer.intermediate_size_per_partition + shard_size = self.routed_experts.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size @@ -497,7 +502,7 @@ def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor: if self.tp_size == 1: return w2_lora_a # w2_lora_a shape (num_experts,rank,input_size) - shard_size = self.base_layer.intermediate_size_per_partition + shard_size = self.routed_experts.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size @@ -594,7 +599,7 @@ def forward(self, *args, **kwargs): @property def quant_method(self): - return self.base_layer.quant_method + return self.routed_experts.quant_method @property def is_internal_router(self) -> bool: @@ -624,8 +629,8 @@ def _create_lora_b_weights(self, max_loras, lora_config): torch.zeros( ( max_loras, - self.base_layer.local_num_experts, - self.base_layer.intermediate_size_per_partition * 2, + self.routed_experts.local_num_experts, + self.routed_experts.intermediate_size_per_partition * 2, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -637,10 +642,10 @@ def _create_lora_b_weights(self, max_loras, lora_config): torch.zeros( ( max_loras, - self.base_layer.local_num_experts, - self.base_layer.hidden_size + self.routed_experts.local_num_experts, + self.routed_experts.hidden_size if not self.fully_sharded - else divide(self.base_layer.hidden_size, self.tp_size), + else divide(self.routed_experts.hidden_size, self.tp_size), lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -673,7 +678,7 @@ def _slice_w13_b(self, w13_lora_b: 
torch.Tensor): return w13_lora_b # w13_lora_b shape (num_experts,output_size,rank) - shard_size = self.base_layer.intermediate_size_per_partition + shard_size = self.routed_experts.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size # HACK: Currently, only GPT-OSS is in interleaved order @@ -761,7 +766,7 @@ def w2_output_size(self): """ Full size """ - return self.base_layer.hidden_size + return self.routed_experts.hidden_size @classmethod def can_replace_layer( diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py index c19b097586f5..e2b1f6cf9481 100644 --- a/vllm/lora/layers/utils.py +++ b/vllm/lora/layers/utils.py @@ -32,6 +32,9 @@ def __post_init__(self): def _get_lora_device(base_layer: nn.Module) -> torch.device: # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34 """Returns the device for where to place the LoRA tensors.""" + if hasattr(base_layer, "routed_experts"): + base_layer = base_layer.routed_experts + # unquantizedLinear if hasattr(base_layer, "weight"): return base_layer.weight.device diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py index 7c1dd39bb5e3..8199601a3f6b 100644 --- a/vllm/lora/lora_model.py +++ b/vllm/lora/lora_model.py @@ -175,6 +175,7 @@ def check_unexpected_modules(modules: dict): continue module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) # Case for expert lora weights + print(f"XXXXXXXXXX MN0 {module_name}") if ".experts" in module_name: expert_idx = module_name.find(".experts") expert_suffix = module_name[expert_idx + 1 :] diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 9d3772560433..e72ec6a8f402 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -648,6 +648,7 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: replaced_module_name = module_name.removeprefix("model.") if 
lora_model.check_lora_name(replaced_module_name): module_name = replaced_module_name + print(f"XXXXXXXXXX MN1 {module_name}") if module_name.endswith(".experts"): if self._is_non_gated_moe and len(replacement_loras) > 0: replacement_loras = self._pad_lora_pairs_to_triplets( diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b62a8d9210dd..90563dea189d 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -403,7 +403,7 @@ def __init__( eplb_manager=eplb_manager, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. - indices_type_getter=lambda: self._runner.quant_method.topk_indices_dtype, + indices_type_getter=lambda: self._runner.routed_experts.quant_method.topk_indices_dtype, # noqa: E501 zero_expert_type=zero_expert_type, num_logical_experts=self.logical_num_experts, ) @@ -479,7 +479,7 @@ def __init__( # Create RoutedExperts instance BEFORE create_weights() # This will hold all expert weight parameters - self.routed_experts = RoutedExperts( + routed_experts = RoutedExperts( self.layer_name, params_dtype, unpadded_hidden_size, @@ -489,6 +489,7 @@ def __init__( expert_map_manager=self.expert_map_manager, # Extra params that are needed by quant_methods, pass along for now rocm_aiter_fmoe_enabled=self.rocm_aiter_fmoe_enabled, + top_k=top_k, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, topk_group=topk_group, @@ -500,14 +501,12 @@ def __init__( activation=MoEActivation.from_str(activation), ) - # HACK - # self.quant_method = self.routed_experts.quant_method + # TODO(bnell): this needs to be stored as a parameter for weight loading. + # ditch this eventually. 
+ self.routed_experts = routed_experts # Move XXXXXXXXXXXXX - if ( - eplb_manager is not None - and not self.routed_experts.quant_method.supports_eplb - ): + if eplb_manager is not None and not routed_experts.quant_method.supports_eplb: # TODO: Add support for additional quantization methods. # The implementation for other quantization methods does not # contain essential differences, but the current quant API @@ -530,14 +529,14 @@ def __init__( routed_output_transform=routed_output_transform, gate=gate, shared_experts=shared_experts, - routed_experts=self.routed_experts, + routed_experts=routed_experts, enable_dbo=vllm_config.parallel_config.enable_dbo, apply_scale_to_output=apply_scale_to_output, routed_scaling_factor=routed_scaling_factor, ) # HACK XXXXXXXXXXXXXXXXXXXXXXXX - self.routed_experts.shared_experts = self._runner.shared_experts + routed_experts.shared_experts = self._runner.shared_experts # For smuggling this layer into the fused moe custom op register_layer_for_moe_forward_op(vllm_config, self) @@ -577,20 +576,22 @@ def maybe_init_modular_kernel(self) -> None: # NOTE(rob): WIP refactor. For quant methods that own the MK # we create the MK during process_weights_after_loading. if ( - self._runner.quant_method.supports_internal_mk - or self._runner.quant_method.is_monolithic + self._runner.routed_experts.quant_method.supports_internal_mk + or self._runner.routed_experts.quant_method.is_monolithic ): return None - self.routed_experts._ensure_moe_quant_config_init() + self._runner.routed_experts._ensure_moe_quant_config_init() # routing_tables only needed for round-robin expert placement with # DeepEP all2all backend. 
routing_tables = self._maybe_init_expert_routing_tables() - if isinstance(self._runner.quant_method, FusedMoEModularMethod): - base_quant_method = self._runner.quant_method.old_quant_method + if isinstance(self._runner.routed_experts.quant_method, FusedMoEModularMethod): + base_quant_method = ( + self._runner.routed_experts.quant_method.old_quant_method + ) else: - base_quant_method = self._runner.quant_method + base_quant_method = self._runner.routed_experts.quant_method prepare_finalize = base_quant_method.maybe_make_prepare_finalize( routing_tables=routing_tables @@ -676,7 +677,7 @@ def is_internal_router(self) -> bool: @property def is_monolithic(self) -> bool: - return self._runner.quant_method.is_monolithic + return self._runner.routed_experts.quant_method.is_monolithic @property def shared_experts(self) -> SharedExperts | None: @@ -688,19 +689,21 @@ def shared_experts(self) -> SharedExperts | None: @property def expert_map(self) -> torch.Tensor | None: - return self.routed_experts.expert_map + return self._runner.routed_experts.expert_map def _maybe_init_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - return self.routed_experts._maybe_init_expert_routing_tables() + return self._runner.routed_experts._maybe_init_expert_routing_tables() def update_expert_map(self): - self.routed_experts.update_expert_map() + self._runner.routed_experts.update_expert_map() def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: """Map global expert ID to local expert ID.""" - return self.routed_experts._map_global_expert_id_to_local_expert_id(expert_id) + return self._runner.routed_experts._map_global_expert_id_to_local_expert_id( + expert_id + ) # # EPLB @@ -772,57 +775,6 @@ def make_expert_params_mapping( num_redundant_experts, ) - # - # Weight Loading (Delegated to RoutedExperts) - # - - # @overload - # def weight_loader( - # self, - # param: torch.nn.Parameter, - # loaded_weight: torch.Tensor, - # weight_name: 
str, - # shard_id: str, - # expert_id: int, - # return_success: Literal[False], - # ) -> None: ... - - # @overload - # def weight_loader( - # self, - # param: torch.nn.Parameter, - # loaded_weight: torch.Tensor, - # weight_name: str, - # shard_id: str, - # expert_id: int, - # return_success: Literal[True], - # ) -> bool: ... - - # def weight_loader( - # self, - # param: torch.nn.Parameter, - # loaded_weight: torch.Tensor, - # weight_name: str, - # shard_id: str, - # expert_id: int, - # return_success: bool = False, - # ) -> bool | None: - # """Delegate to RoutedExperts.""" - # return self.routed_experts.weight_loader( - # param=param, - # loaded_weight=loaded_weight, - # weight_name=weight_name, - # shard_id=shard_id, - # expert_id=expert_id, - # return_success=return_success, - # ) - - # def load_weights( - # self, weights: Iterable[tuple[str, torch.Tensor]] - # ) -> Iterable[str]: - # """Delegate to RoutedExperts.""" - # return self.routed_experts.load_weights(weights) - # # Execution # @@ -843,8 +795,3 @@ def forward_cuda( router_logits: torch.Tensor, ) -> torch.Tensor: return self.forward_native(hidden_states, router_logits) - - -# Mark the FusedMoE weight_loader as supporting MoE-specific parameters -# to avoid expensive runtime reflection in model loading code -# FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 02579ada9a88..a332d01c53fe 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -141,6 +141,10 @@ def _ensure_moe_quant_config_init(self): self.quant_method.get_fused_moe_quant_config(self) ) + @property + def use_ep(self) -> bool: + return self.moe_config.moe_parallel_config.use_ep + @property def expert_map(self) -> torch.Tensor | None: return ( @@ -428,7 +432,7 @@ def weight_loader( is_transposed = getattr(param, 
"is_transposed", False) # compressed-tensors checkpoints with packed weights are stored flipped - # TODO (mgoin): check self.layer._runner.quant_method.quant_config.quant_format + # TODO (mgoin): check self.quant_method.quant_config.quant_format # against known CompressionFormat enum values that have this quality if quant_method_name in ( "CompressedTensorsWNA16MarlinMoEMethod", diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 3d22c7dbb756..b8e94708c678 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -80,7 +80,8 @@ def __init__( @property def do_naive_dispatch_combine(self) -> bool: return ( - self.moe_config.dp_size > 1 and not self.quant_method.supports_internal_mk + self.moe_config.dp_size > 1 + and not self.routed_experts.quant_method.supports_internal_mk ) def _maybe_dispatch( diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 24105709e8a1..14b28f123b67 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -34,11 +34,6 @@ def forward( ) -> torch.Tensor: raise NotImplementedError - @property - @abstractmethod - def quant_method(self) -> FusedMoEMethodBase: - raise NotImplementedError - @property @abstractmethod def shared_experts(self) -> SharedExperts | None: diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index 63d4ac418fda..abf45a51bcff 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -230,7 +230,7 @@ def is_internal_router(self) -> bool: return self.gate is not None @property - def 
quant_method(self) -> FusedMoEMethodBase: + def _quant_method(self) -> FusedMoEMethodBase: return self.routed_experts.quant_method @property @@ -333,8 +333,8 @@ def _must_reduce_shared_expert_output(self) -> bool: """ return ( self._shared_experts is not None - and self.quant_method.moe_kernel is not None - and self.quant_method.moe_kernel.output_is_reduced() + and self.routed_experts.quant_method.moe_kernel is not None + and self.routed_experts.quant_method.moe_kernel.output_is_reduced() ) def _maybe_reduce_shared_expert_output( @@ -412,7 +412,7 @@ def _maybe_pad_hidden_states( ) transformed_hidden_dim = hidden_states.shape[-1] if ( - not self.quant_method.skip_forward_padding + not self.routed_experts.quant_method.skip_forward_padding and self.moe_config.hidden_dim != transformed_hidden_dim ): hidden_states = F.pad( @@ -463,7 +463,7 @@ def _apply_quant_method( SharedExpertsOrder.NO_OVERLAP, ) - if self.quant_method.is_monolithic: + if self.routed_experts.quant_method.is_monolithic: # Monolithic kernels: pass router_logits to routed_experts fused_out = self.routed_experts.forward( x=hidden_states, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index bd43d35c3dad..96f1b22f902c 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -566,6 +566,7 @@ def _get_moe_weight_dtype(layer_id: int = 0) -> str | None: Returns: Weight dtype string (e.g., "mxfp4", "fp8") or None if not available """ + # XXXXXXXXXXXXXXXXXXX if hasattr(self.layers[layer_id].mlp.experts.quant_method, "weight_dtype"): return self.layers[layer_id].mlp.experts.quant_method.weight_dtype return None diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index babee6e081e8..74229699105b 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -160,7 +160,8 @@ def 
_fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: if not isinstance(module, FusedMoE): return False - moe_quant_config = module.quant_method.get_fused_moe_quant_config(module) + quant_method = module.routed_experts.quant_method + moe_quant_config = quant_method.get_fused_moe_quant_config(module.routed_experts) if ( moe_quant_config is None @@ -169,13 +170,13 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: ): return False - if not isinstance(module.quant_method, FusedMoEModularMethod): + if not isinstance(quant_method, FusedMoEModularMethod): # modular kernels could invoke deep_gemm_moe_fp8 return True # Further check if the ModularKernel implementation uses the DeepGemmExperts return isinstance( - module.quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts) + quant_method.moe_kernel, (DeepGemmExperts, TritonOrDeepGemmExperts) ) From e7e1ab11913e8673c2bddaffb388ba6515a599f3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 30 Mar 2026 19:57:26 +0000 Subject: [PATCH 095/191] revert hacks Signed-off-by: Bill Nell --- vllm/lora/lora_model.py | 1 - vllm/lora/model_manager.py | 1 - 2 files changed, 2 deletions(-) diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py index 8199601a3f6b..7c1dd39bb5e3 100644 --- a/vllm/lora/lora_model.py +++ b/vllm/lora/lora_model.py @@ -175,7 +175,6 @@ def check_unexpected_modules(modules: dict): continue module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) # Case for expert lora weights - print(f"XXXXXXXXXX MN0 {module_name}") if ".experts" in module_name: expert_idx = module_name.find(".experts") expert_suffix = module_name[expert_idx + 1 :] diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index e72ec6a8f402..9d3772560433 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -648,7 +648,6 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: replaced_module_name = 
module_name.removeprefix("model.") if lora_model.check_lora_name(replaced_module_name): module_name = replaced_module_name - print(f"XXXXXXXXXX MN1 {module_name}") if module_name.endswith(".experts"): if self._is_non_gated_moe and len(replacement_loras) > 0: replacement_loras = self._pad_lora_pairs_to_triplets( From 629f1edb7ce0cbd8ffb631b71f38375ff3b88d51 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 2 Apr 2026 22:50:33 +0000 Subject: [PATCH 096/191] more cleanups Signed-off-by: Bill Nell --- .../model_executor/layers/fused_moe/config.py | 26 ++ .../layers/fused_moe/expert_map_manager.py | 66 +++- .../fused_moe/fused_moe_modular_method.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 291 +++++------------- .../layers/fused_moe/modular_kernel.py | 2 +- .../layers/fused_moe/routed_experts.py | 115 ++++++- .../layers/fused_moe/shared_fused_moe.py | 2 +- .../fused_moe/unquantized_fused_moe_method.py | 3 +- .../layers/quantization/modelopt.py | 4 +- .../layers/quantization/quark/quark_moe.py | 5 +- vllm/model_executor/models/AXK1.py | 1 - vllm/model_executor/models/aria.py | 1 - .../models/bailing_moe_linear.py | 3 +- vllm/model_executor/models/dbrx.py | 1 - vllm/model_executor/models/sarvam.py | 1 - 15 files changed, 277 insertions(+), 248 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 6434582f987d..c730a9e578c3 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -1190,7 +1190,13 @@ class FusedMoEConfig: # kernel is free to use inplace or not. 
disable_inplace: bool = True + # Set by __post_init__ + rocm_aiter_fmoe_enabled: bool = False + aiter_fmoe_shared_expert_enabled: bool = False + def __post_init__(self): + from vllm._aiter_ops import rocm_aiter_ops + if self.dp_size > 1: logger.debug_once( "Using FusedMoEConfig::max_num_tokens=%d", self.max_num_tokens @@ -1201,6 +1207,26 @@ def __post_init__(self): if self.router_logits_dtype is None: self.router_logits_dtype = self.in_dtype + if self.is_act_and_mul: + self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + self.aiter_fmoe_shared_expert_enabled = ( + rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() + ) + + if self.use_mori_kernels: + assert self.rocm_aiter_fmoe_enabled, ( + "Mori needs to be used with aiter fused_moe for now." + ) + assert not self.aiter_fmoe_shared_expert_enabled, ( + "Mori does not support fusion shared expert now. " + "Turn it off by setting VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0" + ) + + if not self.is_act_and_mul and not current_platform.is_cuda_alike(): + raise NotImplementedError( + "is_act_and_mul=False is supported only for CUDA and ROCm for now" + ) + @property def tp_size(self): return self.moe_parallel_config.tp_size diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 3c5dfcd55d2b..0e5c3e7a2c99 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -12,6 +12,9 @@ from vllm.config.parallel import ExpertPlacementStrategy from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + init_aiter_topK_meta_data, +) logger = init_logger(__name__) @@ -123,6 +126,8 @@ class ExpertMapManager: def __init__( self, + max_num_batched_tokens: int, + top_k: int, global_num_experts: int, logical_num_experts: int, 
moe_parallel_config: FusedMoEParallelConfig, @@ -162,19 +167,74 @@ def __init__( # Initialize routing tables if needed self._maybe_init_routing_tables() + self._init_aiter_shared_experts_topK_buffer( + dp_size=self.moe_parallel_config.dp_size, + top_k=top_k, + max_num_batched_tokens=max_num_batched_tokens, + ) + + if self.use_ep and self.rocm_aiter_enabled: + expert_mask = self.expert_mask + assert expert_mask is None or torch.all( + (expert_mask == 0) | (expert_mask == 1) + ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." + + # Log EP configuration (move into EMM?) + if self.use_ep: + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Expert " + "placement strategy: %s. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", + self.ep_rank, + self.ep_size, + self.placement_strategy, + self.local_num_experts, + self.global_num_experts, + self.get_compressed_map_string(), + ) + + def _init_aiter_shared_experts_topK_buffer( + self, + dp_size: int, + top_k: int, + max_num_batched_tokens: int, + ): + if self.num_fused_shared_experts > 0: + init_aiter_topK_meta_data( + n_routed_experts=self.global_num_experts, + n_shared_experts=self.num_fused_shared_experts, + top_k=top_k, + tp_rank=self.ep_rank if self.use_ep else self.tp_rank, + tp_size=self.ep_size if self.use_ep else self.tp_size, + shared_experts_score=1.0, + max_num_tokens=max_num_batched_tokens * dp_size, + is_EP=self.use_ep, + ) + self._local_num_experts += self.num_fused_shared_experts + + @property + def use_ep(self) -> int: + return self.moe_parallel_config.use_ep + @property def ep_size(self) -> int: - """Expert parallelism world size.""" return self.moe_parallel_config.ep_size @property def ep_rank(self) -> int: - """Expert parallelism rank.""" return self.moe_parallel_config.ep_rank + @property + def tp_size(self) -> int: + return self.moe_parallel_config.tp_size + + @property + def tp_rank(self) -> int: + return 
self.moe_parallel_config.tp_rank + @property def local_num_experts(self) -> int: - """Number of experts assigned to this rank.""" return self._local_num_experts @property diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index ac583671c371..321facc20969 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -50,7 +50,7 @@ def __init__( @staticmethod def make( - moe_layer: torch.nn.Module, + routed_experts: "RoutedExperts", old_quant_method: FusedMoEMethodBase, prepare_finalize: FusedMoEPrepareAndFinalizeModular, shared_experts: SharedExperts | None, @@ -60,7 +60,7 @@ def make( old_quant_method, FusedMoEKernel( prepare_finalize, - old_quant_method.select_gemm_impl(prepare_finalize, moe_layer), + old_quant_method.select_gemm_impl(prepare_finalize, routed_experts), shared_experts=shared_experts, inplace=inplace, ), diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 90563dea189d..ac3b792570ba 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -20,7 +20,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, - RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.expert_map_manager import ( @@ -32,9 +31,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) -from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - init_aiter_topK_meta_data, -) from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, @@ -45,21 +41,17 @@ from 
vllm.model_executor.layers.fused_moe.runner.moe_runner_factory import ( create_moe_runner, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from vllm.model_executor.layers.fused_moe.utils import ( disable_inplace, ) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, ) -from vllm.platforms import current_platform logger = init_logger(__name__) -# Should be method? only used in layer +# TODO: merge with class method def determine_expert_placement_strategy( expert_placement_strategy: ExpertPlacementStrategy, moe_parallel_config: FusedMoEParallelConfig, @@ -213,11 +205,11 @@ def __init__( super().__init__() # IMPORTANT: RoutedExperts must have same layer_name/prefix as FusedMoE for now + # This is still needed self.layer_name = prefix if params_dtype is None: params_dtype = torch.get_default_dtype() - self.params_dtype = params_dtype vllm_config = get_current_vllm_config() @@ -237,32 +229,29 @@ def __init__( dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size - self.is_sequence_parallel = is_sequence_parallel - self.sp_size = tp_size_ if is_sequence_parallel else 1 + is_sequence_parallel = is_sequence_parallel + sp_size = tp_size_ if is_sequence_parallel else 1 - self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( + moe_parallel_config = FusedMoEParallelConfig.make( tp_size_=tp_size_, pcp_size_=pcp_size_, dp_size_=dp_size_, - sp_size_=self.sp_size, + sp_size_=sp_size, vllm_parallel_config=vllm_config.parallel_config, ) - assert self.moe_parallel_config.is_sequence_parallel == is_sequence_parallel + assert moe_parallel_config.is_sequence_parallel == is_sequence_parallel - logger.debug("FusedMoEParallelConfig = %s", str(self.moe_parallel_config)) + logger.debug("FusedMoEParallelConfig = %s", str(moe_parallel_config)) - self.global_num_experts = num_experts + 
num_redundant_experts - self.logical_num_experts = num_experts + global_num_experts = num_experts + num_redundant_experts + logical_num_experts = num_experts # Initialize EPLB manager (or None?) eplb_manager: EplbManager | None = None if enable_eplb: eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) - # Expert mapping used in self.load_weights - self.expert_mapping = expert_mapping - expert_placement_strategy: ExpertPlacementStrategy = ( vllm_config.parallel_config.expert_placement_strategy ) @@ -270,34 +259,29 @@ def __init__( # ROCm aiter shared experts fusion # AITER only supports gated activations (silu/gelu), so disable it # for non-gated MoE (is_act_and_mul=False) - self.rocm_aiter_fmoe_enabled = ( + rocm_aiter_fmoe_enabled = ( rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul ) - self.aiter_fmoe_shared_expert_enabled = ( + aiter_fmoe_shared_expert_enabled = ( rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul ) - self.num_fused_shared_experts = ( + num_fused_shared_experts = ( n_shared_experts - if n_shared_experts is not None and self.aiter_fmoe_shared_expert_enabled + if n_shared_experts is not None and aiter_fmoe_shared_expert_enabled else 0 ) - if ( - not self.aiter_fmoe_shared_expert_enabled - and self.num_fused_shared_experts != 0 - ): + if not aiter_fmoe_shared_expert_enabled and num_fused_shared_experts != 0: raise ValueError( "n_shared_experts is only supported on ROCm aiter when " "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled" ) # Determine expert maps - if self.use_ep: + if moe_parallel_config.use_ep: if eplb_manager is not None: # Validate EPLB configuration - eplb_manager.validate_configuration( - self.global_num_experts, self.ep_size - ) + eplb_manager.validate_configuration(global_num_experts, ep_size) else: assert num_redundant_experts == 0, ( "Redundant experts are only supported with EPLB." 
@@ -306,7 +290,7 @@ def __init__( # Determine expert placement strategy before creating manager expert_placement_strategy_effective = determine_expert_placement_strategy( expert_placement_strategy=expert_placement_strategy, - moe_parallel_config=self.moe_parallel_config, + moe_parallel_config=moe_parallel_config, num_expert_group=num_expert_group, num_redundant_experts=num_redundant_experts, enable_eplb=eplb_manager is not None, @@ -316,71 +300,20 @@ def __init__( # Create expert map manager self.expert_map_manager = ExpertMapManager( - global_num_experts=self.global_num_experts, - logical_num_experts=self.logical_num_experts, - moe_parallel_config=self.moe_parallel_config, + max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens, + top_k=top_k, + global_num_experts=global_num_experts, + logical_num_experts=logical_num_experts, + moe_parallel_config=moe_parallel_config, placement_strategy=expert_placement_strategy_effective, - num_fused_shared_experts=self.num_fused_shared_experts, - rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, + num_fused_shared_experts=num_fused_shared_experts, + rocm_aiter_enabled=rocm_aiter_fmoe_enabled, device=vllm_config.device_config.device, ) - # Register buffers for state_dict compatibility - # if self.expert_map_manager.expert_map is not None: - # self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - # - # if self.expert_map_manager.expert_mask is not None: - # self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - - # Log EP configuration (move into EMM?) - if self.use_ep: - logger.info_once( - "[EP Rank %s/%s] Expert parallelism is enabled. Expert " - "placement strategy: %s. Local/global" - " number of experts: %s/%s. 
Experts local to global index map:" - " %s.", - self.ep_rank, - self.ep_size, - self.expert_map_manager.placement_strategy, - self.expert_map_manager.local_num_experts, - self.expert_map_manager.global_num_experts, - self.expert_map_manager.get_compressed_map_string(), - ) - - self.top_k = top_k - - # move into EMM? - self._init_aiter_shared_experts_topK_buffer( - vllm_config=vllm_config, dp_size=dp_size_ - ) - - # XXXXX move into EMM (this is just an assert) - if self.use_ep and self.rocm_aiter_fmoe_enabled: - expert_mask = self.expert_map_manager.expert_mask - assert expert_mask is None or torch.all( - (expert_mask == 0) | (expert_mask == 1) - ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." - - assert intermediate_size % self.tp_size == 0 - self.intermediate_size_per_partition = intermediate_size // self.tp_size - self.renormalize = renormalize - - # TODO(bnell): these attributes are only used by monolithic kernels. - # Put them in a MoERouterConfig dataclass? - self.use_grouped_topk = use_grouped_topk - if self.use_grouped_topk: - assert num_expert_group is not None and topk_group is not None - self.num_expert_group = num_expert_group - self.topk_group = topk_group - self.custom_routing_function = custom_routing_function - self.scoring_func = scoring_func - self.routed_scaling_factor = routed_scaling_factor - self.e_score_correction_bias = e_score_correction_bias - # TODO(bnell): end attributes - - # Store in runner? - self.apply_router_weight_on_input = apply_router_weight_on_input - self.activation = MoEActivation.from_str(activation) + tp_size = moe_parallel_config.tp_size + assert intermediate_size % tp_size == 0 + intermediate_size_per_partition = intermediate_size // tp_size self._runner: MoERunner @@ -388,7 +321,7 @@ def __init__( # monolithic. 
router = create_fused_moe_router( top_k=top_k, - global_num_experts=self.global_num_experts, + global_num_experts=global_num_experts, renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -399,29 +332,25 @@ def __init__( if not apply_scale_to_output else 1.0, e_score_correction_bias=e_score_correction_bias, - num_fused_shared_experts=self.num_fused_shared_experts, + num_fused_shared_experts=num_fused_shared_experts, eplb_manager=eplb_manager, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. indices_type_getter=lambda: self._runner.routed_experts.quant_method.topk_indices_dtype, # noqa: E501 zero_expert_type=zero_expert_type, - num_logical_experts=self.logical_num_experts, + num_logical_experts=logical_num_experts, ) - self.routing_method_type: RoutingMethodType = router.routing_method_type - # TODO(bnell): is this redundant now? # When using zero experts, slice e_score_correction_bias to cover # only real experts, for compatibility with monolithic kernels that # read it directly. if zero_expert_type is not None and e_score_correction_bias is not None: - self.e_score_correction_bias = e_score_correction_bias[ - : self.logical_num_experts - ] + self.e_score_correction_bias = e_score_correction_bias[logical_num_experts] # Round up hidden size before creating moe_config. # This way moe_config is created with the correct hidden_size from the start. 
unpadded_hidden_size = hidden_size - self.model_type = ( + model_type = ( vllm_config.model_config.hf_config.model_type if vllm_config.model_config is not None else None @@ -429,20 +358,21 @@ def __init__( hidden_size = maybe_roundup_hidden_size( hidden_size=hidden_size, act_dtype=moe_in_dtype, - moe_parallel_config=self.moe_parallel_config, + moe_parallel_config=moe_parallel_config, is_lora_enabled=vllm_config.lora_config is not None, - model_type=self.model_type, + model_type=model_type, ) - self.hidden_size = hidden_size - self.moe_config = FusedMoEConfig( - num_experts=self.global_num_experts, + moe_activation = MoEActivation.from_str(activation) + + moe_config = FusedMoEConfig( + num_experts=global_num_experts, experts_per_token=top_k, hidden_dim=hidden_size, - intermediate_size_per_partition=self.intermediate_size_per_partition, - num_local_experts=self.local_num_experts, - num_logical_experts=self.logical_num_experts, - moe_parallel_config=self.moe_parallel_config, + intermediate_size_per_partition=intermediate_size_per_partition, + num_local_experts=self.expert_map_manager.local_num_experts, + num_logical_experts=logical_num_experts, + moe_parallel_config=moe_parallel_config, in_dtype=moe_in_dtype, moe_backend=vllm_config.kernel_config.moe_backend, router_logits_dtype=router_logits_dtype, @@ -450,32 +380,16 @@ def __init__( has_bias=has_bias, is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, - activation=self.activation, + activation=moe_activation, device=vllm_config.device_config.device, - routing_method=self.routing_method_type, + routing_method=router.routing_method_type, # TODO: in_dtype == out_dtype? disable_inplace=disable_inplace() or shared_experts is not None, ) - # Move XXXXXXXXXXXXX - if self.moe_config.use_mori_kernels: - assert self.rocm_aiter_fmoe_enabled, ( - "Mori needs to be used with aiter fused_moe for now." 
- ) - assert not self.aiter_fmoe_shared_expert_enabled, ( - "Mori does not support fusion shared expert now. " - "Turn it off by setting VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0" - ) - - self.quant_config = quant_config + quant_config = quant_config - logger.debug("FusedMoEConfig = %s", self.moe_config) - - # Move XXXXXXXXXXXXX - if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): - raise NotImplementedError( - "is_act_and_mul=False is supported only for CUDA and ROCm for now" - ) + logger.debug("FusedMoEConfig = %s", moe_config) # Create RoutedExperts instance BEFORE create_weights() # This will hold all expert weight parameters @@ -484,11 +398,11 @@ def __init__( params_dtype, unpadded_hidden_size, intermediate_size, - self.moe_config, - self.quant_config, + moe_config, + quant_config, expert_map_manager=self.expert_map_manager, # Extra params that are needed by quant_methods, pass along for now - rocm_aiter_fmoe_enabled=self.rocm_aiter_fmoe_enabled, + rocm_aiter_fmoe_enabled=rocm_aiter_fmoe_enabled, # get from moe config top_k=top_k, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -496,17 +410,18 @@ def __init__( custom_routing_function=custom_routing_function, scoring_func=scoring_func, routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, + e_score_correction_bias=e_score_correction_bias, # get from router? apply_router_weight_on_input=apply_router_weight_on_input, - activation=MoEActivation.from_str(activation), + activation=moe_activation, ) # TODO(bnell): this needs to be stored as a parameter for weight loading. # ditch this eventually. self.routed_experts = routed_experts - # Move XXXXXXXXXXXXX - if eplb_manager is not None and not routed_experts.quant_method.supports_eplb: + # Where to move this? + quant_method = routed_experts.quant_method + if enable_eplb and not quant_method.supports_eplb: # TODO: Add support for additional quantization methods. 
# The implementation for other quantization methods does not # contain essential differences, but the current quant API @@ -515,7 +430,7 @@ def __init__( # If you plan to add support for more quantization methods, # please refer to the implementation in `Fp8MoEMethod`. raise NotImplementedError( - f"EPLB is not supported {self.quant_method.__class__.__name__}." + f"EPLB is not supported {quant_method.__class__.__name__}." ) # Storing the runner in the FusedMoE is an intermediate state, eventually @@ -523,7 +438,7 @@ def __init__( # for MoE ops. self._runner = create_moe_runner( layer_name=self.layer_name, - moe_config=self.moe_config, + moe_config=moe_config, router=router, routed_input_transform=routed_input_transform, routed_output_transform=routed_output_transform, @@ -541,18 +456,6 @@ def __init__( # For smuggling this layer into the fused moe custom op register_layer_for_moe_forward_op(vllm_config, self) - def extra_repr(self) -> str: - s = ( - f"global_num_experts={self.global_num_experts}, " - f"local_num_experts={self.local_num_experts}, " - f"top_k={self.top_k}, " - f"intermediate_size_per_partition={self.intermediate_size_per_partition}, " # noqa: E501 - f"tp_size={self.tp_size},\n" - f"ep_size={self.ep_size}, " - ) - - return s - # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py # and vllm/distributed/elastic_ep/elastic_execute.py # can safely swap out the quant_method. We should figure out a less @@ -560,14 +463,6 @@ def extra_repr(self) -> str: def _replace_quant_method(self, mk: FusedMoEMethodBase): self._runner._replace_quant_method(mk) - # def _ensure_moe_quant_config_init(self): - # if self._runner.quant_method.moe_quant_config is None: - # # Note: the moe_quant_config can't be constructed until after - # # weight loading post processing. 
- # self._runner.quant_method.moe_quant_config = ( - # self._runner.quant_method.get_fused_moe_quant_config(self) - # ) - # Note: maybe_init_modular_kernel should only be called by # prepare_communication_buffer_for_model. # This is called after all weight loading and post-processing, so it @@ -605,8 +500,8 @@ def maybe_init_modular_kernel(self) -> None: self, base_quant_method, prepare_finalize, - self.shared_experts, - inplace=not self.moe_config.disable_inplace, + self._runner.shared_experts, + inplace=not base_quant_method.moe.disable_inplace, ) ) @@ -621,40 +516,33 @@ def layer_id(self): return extract_layer_index(self.layer_name) - @property - def tp_size(self): - return self.moe_parallel_config.tp_size - - @property - def ep_size(self): - return self.moe_parallel_config.ep_size + # + # Attributes still needed by models + # @property - def tp_rank(self): - return self.moe_parallel_config.tp_rank + def is_monolithic(self) -> bool: + return self._runner.routed_experts.quant_method.is_monolithic @property - def ep_rank(self): - return self.moe_parallel_config.ep_rank + def activation(self) -> MoEActivation: + return self._runner.routed_experts.activation @property - def use_ep(self): - return self.moe_parallel_config.use_ep + def is_internal_router(self) -> bool: + # By default, router/gate is called before FusedMoE forward pass + return self._runner.is_internal_router - # XXXXXXXXX keep this separate - @property - def local_num_experts(self) -> int: - """Number of experts assigned to this rank.""" - return self.expert_map_manager.local_num_experts + # + # Expert maps + # @property def expert_placement_strategy(self) -> ExpertPlacementStrategy: - """Expert placement strategy ('linear' or 'round_robin').""" return self.expert_map_manager.placement_strategy @property def expert_global_to_physical(self) -> torch.Tensor | None: - """Routing table: global expert ID to physical expert ID.""" tables = self.expert_map_manager.routing_tables return tables[0] if tables 
else None @@ -670,23 +558,6 @@ def expert_local_to_global(self) -> torch.Tensor | None: tables = self.expert_map_manager.routing_tables return tables[2] if tables else None - @property - def is_internal_router(self) -> bool: - # By default, router/gate is called before FusedMoE forward pass - return self._runner.is_internal_router - - @property - def is_monolithic(self) -> bool: - return self._runner.routed_experts.quant_method.is_monolithic - - @property - def shared_experts(self) -> SharedExperts | None: - return self._runner.shared_experts - - # - # Expert maps - # - @property def expert_map(self) -> torch.Tensor | None: return self._runner.routed_experts.expert_map @@ -709,24 +580,6 @@ def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: # EPLB # - def _init_aiter_shared_experts_topK_buffer( - self, vllm_config: VllmConfig, dp_size: int - ): - if self.num_fused_shared_experts > 0: - init_aiter_topK_meta_data( - n_routed_experts=self.global_num_experts, - n_shared_experts=self.num_fused_shared_experts, - top_k=self.top_k, - tp_rank=self.ep_rank if self.use_ep else self.tp_rank, - tp_size=self.ep_size if self.use_ep else self.tp_size, - shared_experts_score=1.0, - max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens - * dp_size, - is_EP=self.use_ep, - ) - # HACK - self.expert_map_manager._local_num_experts += self.num_fused_shared_experts - def get_expert_weights(self) -> Iterable[torch.Tensor]: """Delegate to EPLB manager.""" if self._runner.router.eplb_manager is not None: diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index ef467328db99..7a5f96a5b07b 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1545,7 +1545,7 @@ def apply_monolithic( hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, - router_logits: torch.Tensor | tuple[torch.Tensor, 
torch.Tensor], + router_logits: torch.Tensor, activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index a332d01c53fe..709aa44af072 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -91,7 +91,7 @@ def __init__( moe_quant_params = { "num_experts": moe_config.num_local_experts, - "hidden_size": moe_config.hidden_dim, + "hidden_size": self.hidden_size, "unpadded_hidden_size": unpadded_hidden_size, "intermediate_size_per_partition": ( moe_config.intermediate_size_per_partition @@ -281,6 +281,10 @@ def _load_per_channel_weight_scale( ): # for per channel weight quantization if shard_id == "w2": + hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim) + expert_data = self._narrow_expert_data_for_padding( + expert_data, loaded_weight, hidden_dim=hidden_dim + ) expert_data.copy_(loaded_weight) elif shard_id in ("w1", "w3"): self._load_w13( @@ -291,6 +295,59 @@ def _load_per_channel_weight_scale( tp_rank=tp_rank, ) + @staticmethod + def _get_hidden_dim(shard_dim: int, ndim: int) -> int: + """Compute the hidden dimension index from the shard (intermediate) + dimension and tensor rank. + + For 2D weight tensors the two data dims are (0, 1). For 3D tensors + with an expert dimension at dim 0, they are (1, 2). ``shard_dim`` + occupies one of these; the hidden dimension is the other. + For 1D tensors (e.g. per-channel scales) returns 0. 
+ """ + if ndim < 2: + return 0 + dim_a = ndim - 2 + dim_b = ndim - 1 + if shard_dim == dim_a: + return dim_b + if shard_dim == dim_b: + return dim_a + raise ValueError( + f"shard_dim={shard_dim} is not a valid data dimension " + f"for a {ndim}D tensor (expected {dim_a} or {dim_b})" + ) + + @staticmethod + def _narrow_expert_data_for_padding( + expert_data: torch.Tensor, + loaded_weight: torch.Tensor, + hidden_dim: int, + ) -> torch.Tensor: + """Narrow expert_data hidden dim to match loaded_weight for padded + hidden_size. + + When backends (e.g., DeepEP) round up hidden_size, weight parameters + are larger than checkpoint weights. Narrow the padded hidden dimension + before copying. + + Args: + expert_data: The (possibly padded) parameter tensor to narrow. + loaded_weight: The checkpoint weight tensor with original size. + hidden_dim: The dimension index corresponding to hidden_size. + Must be non-negative. + """ + if ( + loaded_weight.ndim > 0 + and 0 <= hidden_dim < expert_data.ndim + and hidden_dim < loaded_weight.ndim + and expert_data.shape[hidden_dim] > loaded_weight.shape[hidden_dim] + ): + expert_data = expert_data.narrow( + hidden_dim, 0, loaded_weight.shape[hidden_dim] + ) + return expert_data + def _load_w13( self, expert_data: torch.Tensor, @@ -309,9 +366,17 @@ def _load_w13( # Only narrow if the loaded_weight is not a scalar (0-dim tensor) # and we're not loading the full weight if not load_full and loaded_weight.ndim > 0: - loaded_weight = loaded_weight.narrow( - shard_dim, shard_size * tp_rank, shard_size - ) + # Handle padding: loaded_weight might be smaller than shard_size on last + # TP rank + start_offset = shard_size * tp_rank + available = loaded_weight.shape[shard_dim] - start_offset + if available <= 0: + # If there is no available weight to load for this TP rank + # (can happen on last TP rank with padding), we can skip + # loading and return early + return + narrow_size = min(shard_size, available) + loaded_weight = 
loaded_weight.narrow(shard_dim, start_offset, narrow_size) # Narrow parameter and load. # w1, gate_proj: Load into first logical weight of w13. if shard_id == "w1": @@ -320,6 +385,10 @@ def _load_w13( else: assert shard_id == "w3" expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) + hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim) + expert_data = self._narrow_expert_data_for_padding( + expert_data, loaded_weight, hidden_dim=hidden_dim + ) expert_data.copy_(loaded_weight) def _load_w2( @@ -337,10 +406,22 @@ def _load_w2( # Only narrow if the loaded_weight is not a scalar (0-dim tensor) # and we're not loading the full weight if not load_full and loaded_weight.ndim > 0: - loaded_weight = loaded_weight.narrow( - shard_dim, shard_size * tp_rank, shard_size - ) + # Handle padding: loaded_weight might be smaller than shard_size on last + # TP rank + start_offset = shard_size * tp_rank + available = loaded_weight.shape[shard_dim] - start_offset + if available <= 0: + # If there is no available weight to load for this TP rank + # (can happen on last TP rank with padding), we can skip + # loading and return early + return + narrow_size = min(shard_size, available) + loaded_weight = loaded_weight.narrow(shard_dim, start_offset, narrow_size) # w2, down_proj: Load into only logical weight of w2. + hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim) + expert_data = self._narrow_expert_data_for_padding( + expert_data, loaded_weight, hidden_dim=hidden_dim + ) expert_data.copy_(loaded_weight) def _load_single_value( @@ -465,9 +546,25 @@ def weight_loader( expert_data = param.data[expert_id] if shard_id == "w2": + # BnB params are stored as flat packed tensors (e.g. + # (packed_size, 1)), not in the logical weight layout. + # Narrowing packed data for hidden-dim padding is not + # meaningful, so require an exact shape match. 
+ if expert_data.shape != loaded_weight.shape: + raise ValueError( + "BitsAndBytes quantization with padded hidden_size " + "(e.g., from DeepEP) is not supported. " + f"Parameter shape {tuple(expert_data.shape)} != " + f"checkpoint shape {tuple(loaded_weight.shape)}" + ) expert_data.copy_(loaded_weight) elif shard_id in ("w1", "w3"): - # BNB inflight quantization has already sharded the weights + # BnB stores weights as flat packed tensors. _load_w13 is + # still used to split the w1/w3 portions along shard_dim. + # _narrow_expert_data_for_padding will be a no-op since + # packed sizes should already match; if DeepEP padding + # causes a mismatch the copy_() will fail with a clear + # shape error. full_load = True self._load_w13( shard_id=shard_id, @@ -778,7 +875,7 @@ def make_expert_params_mapping( # Execution # - # TODO: split this + # TODO: split/overload this def forward( self, x: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py index 9aa9fe8e0a9a..9cfcb1baa9bb 100644 --- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py @@ -18,7 +18,7 @@ def forward( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: return super().forward( hidden_states=hidden_states, router_logits=router_logits, diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index a53420ebb8f4..8a80f1d1260e 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -6,7 +6,6 @@ import torch import torch.nn.functional as F -from torch.nn import Module from torch.nn.parameter import Parameter import vllm.envs as envs @@ -212,7 +211,7 @@ def _maybe_pad_weight(self, 
weight: torch.Tensor) -> torch.Tensor: def _setup_kernel( self, - layer: Module, + layer: "RoutedExperts", w13: torch.Tensor, w2: torch.Tensor, ) -> None: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 844624a357e7..d955733d0358 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1949,7 +1949,7 @@ def apply_monolithic( layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from flashinfer.fused_moe.core import ( ActivationType, Fp8QuantizationType, @@ -2034,7 +2034,7 @@ def apply( topk_weights: torch.Tensor, topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert not self.is_monolithic raise NotImplementedError( "Non-monolithic MXFP8 MoE path is not yet implemented." diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 4ebf30a5260b..c43dd2066303 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1178,8 +1178,7 @@ def apply_monolithic( layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, - expert_map: torch.Tensor | None = None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet." 
@@ -1197,7 +1196,7 @@ def apply_monolithic( topk=layer.top_k, renormalize=layer.renormalize, global_num_experts=layer.global_num_experts, - expert_map=expert_map, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, apply_router_weight_on_input=layer.apply_router_weight_on_input, unpadded_N_w1=self.intermediate_size_per_partition * 2, diff --git a/vllm/model_executor/models/AXK1.py b/vllm/model_executor/models/AXK1.py index f5ed4400fb65..05e5a77fea37 100644 --- a/vllm/model_executor/models/AXK1.py +++ b/vllm/model_executor/models/AXK1.py @@ -170,7 +170,6 @@ def __init__( top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=config.norm_topk_prob, quant_config=quant_config, use_grouped_topk=True, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 221e991dfc29..6174cd9d1bf5 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -283,7 +283,6 @@ def __init__( hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, quant_config=quant_config, - reduce_results=True, prefix=f"{prefix}.experts", ) diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py index b8bc34325190..3dc4b0898b6d 100644 --- a/vllm/model_executor/models/bailing_moe_linear.py +++ b/vllm/model_executor/models/bailing_moe_linear.py @@ -305,7 +305,7 @@ def __init__( self.hidden_size = config.hidden_size self.quant_config = quant_config self.num_shared_experts = config.num_shared_experts - self.score_function = getattr(config, "score_function", None) + self.score_function: str | None = getattr(config, "score_function", None) self.n_group = getattr(config, "n_group", None) self.topk_group = getattr(config, "topk_group", None) self.use_grouped_topk = self.n_group is not None and self.topk_group is not None @@ -358,7 +358,6 @@ def __init__( 
top_k=self.top_k, hidden_size=self.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=self.norm_expert_prob, quant_config=quant_config, prefix=f"{prefix}.experts", diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index ca6e6a49a98a..a72f4e487164 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -85,7 +85,6 @@ def __init__( hidden_size=config.d_model, intermediate_size=config.ffn_config.ffn_hidden_size, params_dtype=params_dtype, - reduce_results=True, renormalize=True, quant_config=quant_config, tp_size=get_tensor_model_parallel_world_size(), diff --git a/vllm/model_executor/models/sarvam.py b/vllm/model_executor/models/sarvam.py index 5544e0fe4cbd..8940dd611179 100644 --- a/vllm/model_executor/models/sarvam.py +++ b/vllm/model_executor/models/sarvam.py @@ -341,7 +341,6 @@ def __init__( top_k=self.top_k, hidden_size=self.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, renormalize=self.norm_expert_prob, quant_config=quant_config, prefix=f"{prefix}.experts", From 0bf4b9d74596efcba8d9a73e35d16b007bac2d0d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 6 Apr 2026 23:58:58 +0000 Subject: [PATCH 097/191] fixes Signed-off-by: Bill Nell --- .../moe/modular_kernel_tools/common.py | 2 +- tests/kernels/moe/test_cutlass_moe.py | 4 +- tests/kernels/moe/test_flashinfer.py | 6 +- tests/kernels/moe/test_flashinfer_moe.py | 3 +- tests/kernels/moe/test_moe.py | 3 +- tests/kernels/moe/test_moe_layer.py | 2 +- tests/kernels/moe/utils.py | 4 +- .../model_executor/layers/fused_moe/config.py | 12 +- .../layers/fused_moe/eplb_manager.py | 49 +-- .../layers/fused_moe/expert_map_manager.py | 51 +++ vllm/model_executor/layers/fused_moe/layer.py | 329 +++++++----------- .../layers/fused_moe/routed_experts.py | 107 +++++- .../fused_moe/runner/chunking_moe_runner.py | 2 +- .../fused_moe/runner/moe_runner_base.py | 18 +- 
vllm/model_executor/models/nemotron_h.py | 1 - 15 files changed, 320 insertions(+), 273 deletions(-) diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 47d5ef6a07f5..40a57369d05f 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -574,7 +574,7 @@ def next_power_of_2(x): num_experts=config.E, experts_per_token=config.topk, hidden_dim=config.K, - intermediate_size_per_partition=config.N, + intermediate_size=config.N, num_local_experts=config.num_local_experts, num_logical_experts=config.E, moe_parallel_config=moe_parallel_config, diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index e06672f41d0c..fe6f90b0e7d6 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -200,7 +200,7 @@ def slice_experts(): moe_config = make_dummy_moe_config( num_experts=w2.shape[0], hidden_dim=w2.shape[1], - intermediate_size_per_partition=w2.shape[2], + intermediate_size=w2.shape[2], in_dtype=a.dtype, ) kernel = mk.FusedMoEKernel( @@ -270,7 +270,7 @@ def run_8_bit( moe_config = make_dummy_moe_config( num_experts=moe_tensors.w2_q.shape[0], # type: ignore[union-attr] hidden_dim=moe_tensors.w2_q.shape[1], # type: ignore[union-attr] - intermediate_size_per_partition=moe_tensors.w2_q.shape[2], # type: ignore[union-attr] + intermediate_size=moe_tensors.w2_q.shape[2], # type: ignore[union-attr] in_dtype=moe_tensors.a.dtype, ) kernel = mk.FusedMoEKernel( diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index db499b68843f..82d3ef8c7fab 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -165,12 +165,11 @@ def make_moe_tensors_8bit( num_experts=e, experts_per_token=topk, hidden_dim=k, - intermediate_size_per_partition=n, + intermediate_size=n, num_local_experts=e, num_logical_experts=e, 
moe_parallel_config=layer.moe_parallel_config, in_dtype=hidden_states.dtype, - is_act_and_mul=is_gated, routing_method=layer.routing_method_type, activation=activation, device=w13_quantized.device, @@ -339,14 +338,13 @@ def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig: num_experts=e, experts_per_token=topk, hidden_dim=k, - intermediate_size_per_partition=n, + intermediate_size=n, num_local_experts=e, num_logical_experts=e, activation=activation, device="cuda", moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), in_dtype=torch.bfloat16, - is_act_and_mul=activation.is_gated, routing_method=RoutingMethodType.TopK, ) diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py index a3fb474f1517..9295cbc5f1da 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -96,14 +96,13 @@ def test_flashinfer_fp4_moe_no_graph( num_experts=e, experts_per_token=topk, hidden_dim=k, - intermediate_size_per_partition=n, + intermediate_size=n, num_local_experts=e, num_logical_experts=e, activation=activation, device="cuda", moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), in_dtype=dtype, - is_act_and_mul=is_gated_act, routing_method=RoutingMethodType.TopK, ) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 28be9f23d661..55430fba6598 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -1661,14 +1661,13 @@ def test_unquantized_bf16_flashinfer_trtllm_backend( num_experts=e, experts_per_token=topk, hidden_dim=k, - intermediate_size_per_partition=n, + intermediate_size=n, num_local_experts=e, num_logical_experts=e, activation="silu", device="cuda", moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), in_dtype=dtype, - is_act_and_mul=True, routing_method=RoutingMethodType.Renormalize, max_num_tokens=m, ) diff --git a/tests/kernels/moe/test_moe_layer.py 
b/tests/kernels/moe/test_moe_layer.py index 193f72e2f5ab..9e1d07950b4c 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -857,7 +857,7 @@ def make_fused_moe_layer( topk_group=topk_group, quant_config=quant_config, tp_size=tp_size, - ep_size=ep_size, + # ep_size=ep_size, dp_size=dp_size, pcp_size=pcp_size, prefix="from_forward_context", diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 8763ad683517..1452b9bbc03e 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -47,7 +47,7 @@ def make_dummy_moe_config( num_experts: int = 1, experts_per_token: int = 1, hidden_dim: int = 1, - intermediate_size_per_partition: int = 1, + intermediate_size: int = 1, in_dtype: torch.dtype = torch.bfloat16, ) -> FusedMoEConfig: """ @@ -61,7 +61,7 @@ def make_dummy_moe_config( num_experts=num_experts, experts_per_token=experts_per_token, hidden_dim=hidden_dim, - intermediate_size_per_partition=intermediate_size_per_partition, + intermediate_size=intermediate_size, num_local_experts=num_experts, num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index c730a9e578c3..c0ee29f37f4d 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -1164,7 +1164,7 @@ class FusedMoEConfig: num_experts: int experts_per_token: int hidden_dim: int - intermediate_size_per_partition: int + intermediate_size: int num_local_experts: int num_logical_experts: int activation: MoEActivation @@ -1181,7 +1181,6 @@ class FusedMoEConfig: moe_backend: str = "auto" max_num_tokens: int = envs.VLLM_MOE_DP_CHUNK_SIZE has_bias: bool = False - is_act_and_mul: bool = True is_lora_enabled: bool = False # This flag is used to disable the inplace optimization @@ -1191,12 +1190,17 @@ class FusedMoEConfig: disable_inplace: bool = 
True # Set by __post_init__ + intermediate_size_per_partition: int = -1 rocm_aiter_fmoe_enabled: bool = False aiter_fmoe_shared_expert_enabled: bool = False def __post_init__(self): from vllm._aiter_ops import rocm_aiter_ops + tp_size = self.moe_parallel_config.tp_size + assert self.intermediate_size % tp_size == 0 + self.intermediate_size_per_partition = self.intermediate_size // tp_size + if self.dp_size > 1: logger.debug_once( "Using FusedMoEConfig::max_num_tokens=%d", self.max_num_tokens @@ -1227,6 +1231,10 @@ def __post_init__(self): "is_act_and_mul=False is supported only for CUDA and ROCm for now" ) + @property + def is_act_and_mul(self) -> bool: + return self.activation.is_gated + @property def tp_size(self): return self.moe_parallel_config.tp_size diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index 8b046073f1e4..ff22cc84e68f 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -29,6 +29,8 @@ class EplbManager: def __init__( self, + ep_size: int, + global_num_experts: int, num_redundant_experts: int = 0, ): self.num_redundant_experts = num_redundant_experts @@ -36,6 +38,14 @@ def __init__( # Runtime EPLB state self.state = EplbLayerState() + # Validate EPLB configuration + # EPLB currently only supports even distribution of experts across ranks + assert global_num_experts % ep_size == 0, ( + f"EPLB currently only supports even distribution of " + f"experts across ranks. Got {global_num_experts} experts " + f"and {ep_size} EP ranks." 
+ ) + def set_state( self, moe_layer_idx: int, @@ -131,13 +141,7 @@ def _maybe_make_contiguous( assert all( weight.is_contiguous() for name, weight in weights - if not ( - name.startswith("_runner._shared_experts._layer") - or name.startswith("routed_experts.shared_experts._layer") - or name.startswith("_runner.gate.") - or name.startswith("_runner.routed_input_transform.") - or name.startswith("_runner.routed_output_transform.") - ) + if not name.startswith("shared_experts._layer") and name not in NON_EXPERT_WEIGHTS ) @@ -146,34 +150,5 @@ def _maybe_make_contiguous( for name, weight in weights if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size([]) - and not name.startswith("_runner._shared_experts._layer") - and not name.startswith("routed_experts.shared_experts._layer") - # exclude parameters from non-expert submodules, - # e.g. gate/shared/transforms. - and not name.startswith("_runner.gate.") - and not name.startswith("_runner.routed_input_transform.") - and not name.startswith("_runner.routed_output_transform.") + and not name.startswith("shared_experts._layer") ] - - @staticmethod - def validate_configuration( - global_num_experts: int, - ep_size: int, - ) -> None: - """ - Validate EPLB configuration. - - Args: - global_num_experts: Total number of experts (including redundant) - ep_size: Expert parallelism size - - Raises: - AssertionError: If configuration is invalid - """ - - # EPLB currently only supports even distribution of experts across ranks - assert global_num_experts % ep_size == 0, ( - f"EPLB currently only supports even distribution of " - f"experts across ranks. Got {global_num_experts} experts " - f"and {ep_size} EP ranks." 
- ) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 0e5c3e7a2c99..5364f4163102 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -112,6 +112,43 @@ def determine_expert_map( return (local_num_experts, expert_map, expert_mask) +def determine_expert_placement_strategy( + expert_placement_strategy: ExpertPlacementStrategy, + moe_parallel_config: FusedMoEParallelConfig, + num_expert_group: int | None, + num_redundant_experts: int, + enable_eplb: bool, +) -> ExpertPlacementStrategy: + if expert_placement_strategy == "round_robin": + round_robin_supported = ( + (num_expert_group is not None and num_expert_group > 1) + and num_redundant_experts == 0 + and not enable_eplb + ) + + if not round_robin_supported: + logger.warning( + "Round-robin expert placement is only supported for " + "models with multiple expert groups and no redundant " + "experts. Falling back to linear expert placement." + ) + return "linear" + if ( + moe_parallel_config.use_all2all_kernels + and not moe_parallel_config.use_deepep_ll_kernels + and not moe_parallel_config.use_nixl_ep_kernels + ): + logger.warning( + "Round-robin expert placement currently only supports " + "the DeepEP low-latency or NIXL EP backend, but '%s' was configured. " + "Falling back to linear expert placement.", + moe_parallel_config.all2all_backend, + ) + return "linear" + + return expert_placement_strategy + + class ExpertMapManager: """ Manages expert ID mappings and placement for Expert Parallelism. 
@@ -130,8 +167,11 @@ def __init__( top_k: int, global_num_experts: int, logical_num_experts: int, + num_redundant_experts: int, + num_expert_group: int | None, moe_parallel_config: FusedMoEParallelConfig, placement_strategy: ExpertPlacementStrategy, + enable_eplb: bool, num_fused_shared_experts: int = 0, rocm_aiter_enabled: bool = False, device: torch.device | None = None, @@ -156,6 +196,17 @@ def __init__( self.rocm_aiter_enabled = rocm_aiter_enabled self.device = device + if moe_parallel_config.use_ep: + # Determine expert placement strategy before creating manager + # TODO move into EMM + placement_strategy = determine_expert_placement_strategy( + expert_placement_strategy=placement_strategy, + moe_parallel_config=moe_parallel_config, + num_expert_group=num_expert_group, + num_redundant_experts=num_redundant_experts, + enable_eplb=enable_eplb, + ) + # Determine effective placement strategy self._placement_strategy = self._determine_placement_strategy( placement_strategy diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index ac3b792570ba..31d6e172c541 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -7,7 +7,7 @@ import vllm.envs as envs from vllm._aiter_ops import rocm_aiter_ops -from vllm.config import VllmConfig, get_current_vllm_config +from vllm.config import ParallelConfig, VllmConfig, get_current_vllm_config from vllm.config.parallel import ExpertPlacementStrategy from vllm.distributed import ( get_dp_group, @@ -51,80 +51,6 @@ logger = init_logger(__name__) -# TODO: merge with class method -def determine_expert_placement_strategy( - expert_placement_strategy: ExpertPlacementStrategy, - moe_parallel_config: FusedMoEParallelConfig, - num_expert_group: int | None, - num_redundant_experts: int, - enable_eplb: bool, -) -> ExpertPlacementStrategy: - if expert_placement_strategy == "round_robin": - round_robin_supported = ( - (num_expert_group 
is not None and num_expert_group > 1) - and num_redundant_experts == 0 - and not enable_eplb - ) - - if not round_robin_supported: - logger.warning( - "Round-robin expert placement is only supported for " - "models with multiple expert groups and no redundant " - "experts. Falling back to linear expert placement." - ) - return "linear" - if ( - moe_parallel_config.use_all2all_kernels - and not moe_parallel_config.use_deepep_ll_kernels - and not moe_parallel_config.use_nixl_ep_kernels - ): - logger.warning( - "Round-robin expert placement currently only supports " - "the DeepEP low-latency or NIXL EP backend, but '%s' was configured. " - "Falling back to linear expert placement.", - moe_parallel_config.all2all_backend, - ) - return "linear" - - return expert_placement_strategy - - -# TODO(rob): move this down to the kernel. -def maybe_roundup_hidden_size( - hidden_size: int, - act_dtype: torch.dtype, - moe_parallel_config: FusedMoEParallelConfig, - is_lora_enabled: bool, - model_type: str | None, -) -> int: - """ - Given layer hidden size and MoE configurations, round up hidden_size - if necessary. - - Args: - hidden_size: Layer hidden-size - act_dtype: Data type of the layer activations. - moe_parallel_config: Fused MoE parallelization strategy configuration. - is_lora_enabled: True if the engine is enabled with LoRA. This - is used in the case of mxfp4 quantization in selecting the - MxFP4Backend. - model_type: for checking if gpt-oss - - Return: - Rounded up hidden_size if rounding up is required based on the configs. - Original hidden size otherwise. 
- """ - from vllm.model_executor.layers.fused_moe.all2all_utils import ( - maybe_roundup_layer_hidden_size, - ) - - hidden_size = maybe_roundup_layer_hidden_size( - hidden_size, act_dtype, moe_parallel_config - ) - - return hidden_size - - def register_layer_for_moe_forward_op( vllm_config: VllmConfig, layer: torch.nn.Module, # FusedMoE for now @@ -138,6 +64,67 @@ def register_layer_for_moe_forward_op( compilation_config.static_all_moe_layers.append(prefix) +def make_parallel_config( + tp_size: int | None, + dp_size: int | None, + pcp_size: int | None, + is_sequence_parallel: bool, + parallel_config: ParallelConfig, +) -> FusedMoEParallelConfig: + tp_size_ = ( + tp_size if tp_size is not None else get_tensor_model_parallel_world_size() + ) + dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size + pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size + + is_sequence_parallel = is_sequence_parallel + sp_size = tp_size_ if is_sequence_parallel else 1 + + moe_parallel_config = FusedMoEParallelConfig.make( + tp_size_=tp_size_, + pcp_size_=pcp_size_, + dp_size_=dp_size_, + sp_size_=sp_size, + vllm_parallel_config=parallel_config, + ) + + assert moe_parallel_config.is_sequence_parallel == is_sequence_parallel + + logger.debug("FusedMoEParallelConfig = %s", str(moe_parallel_config)) + + return moe_parallel_config + + +def determine_expert_counts( + num_experts: int, + num_redundant_experts: int, + n_shared_experts: int | None, + is_act_and_mul: bool, +) -> tuple[int, int, int]: + global_num_experts = num_experts + num_redundant_experts + logical_num_experts = num_experts + # ROCm aiter shared experts fusion + # AITER only supports gated activations (silu/gelu), so disable it + # for non-gated MoE (is_act_and_mul=False) + # rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul + aiter_fmoe_shared_expert_enabled = ( + rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul + ) + + 
num_fused_shared_experts = ( + n_shared_experts + if n_shared_experts is not None and aiter_fmoe_shared_expert_enabled + else 0 + ) + if not aiter_fmoe_shared_expert_enabled and num_fused_shared_experts != 0: + raise ValueError( + "n_shared_experts is only supported on ROCm aiter when " + "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled" + ) + + return global_num_experts, logical_num_experts, num_fused_shared_experts + + # --8<-- [start:fused_moe] @CustomOp.register("fused_moe") class FusedMoE(CustomOp): @@ -177,7 +164,6 @@ def __init__( topk_group: int | None = None, quant_config: QuantizationConfig | None = None, tp_size: int | None = None, - ep_size: int | None = None, dp_size: int | None = None, pcp_size: int | None = None, prefix: str = "", @@ -187,7 +173,6 @@ def __init__( e_score_correction_bias: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, activation: str = "silu", - is_act_and_mul: bool = True, enable_eplb: bool = False, num_redundant_experts: int = 0, has_bias: bool = False, @@ -204,99 +189,44 @@ def __init__( ): super().__init__() + vllm_config = get_current_vllm_config() + # IMPORTANT: RoutedExperts must have same layer_name/prefix as FusedMoE for now # This is still needed self.layer_name = prefix - if params_dtype is None: - params_dtype = torch.get_default_dtype() - - vllm_config = get_current_vllm_config() - - # FIXME (varun): We should have a better way of inferring the activation - # datatype. This works for now as the tensor datatype entering the MoE - # operation is typically unquantized (i.e. float16/bfloat16). - if vllm_config.model_config is not None: - moe_in_dtype = vllm_config.model_config.dtype - else: - # TODO (bnell): This is a hack to get test_mixtral_moe to work - # since model_config is not set in the pytest test. 
- moe_in_dtype = params_dtype - - tp_size_ = ( - tp_size if tp_size is not None else get_tensor_model_parallel_world_size() - ) - dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size - pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size - - is_sequence_parallel = is_sequence_parallel - sp_size = tp_size_ if is_sequence_parallel else 1 - - moe_parallel_config = FusedMoEParallelConfig.make( - tp_size_=tp_size_, - pcp_size_=pcp_size_, - dp_size_=dp_size_, - sp_size_=sp_size, - vllm_parallel_config=vllm_config.parallel_config, + moe_activation = MoEActivation.from_str(activation) + is_act_and_mul = moe_activation.is_gated + + moe_parallel_config = make_parallel_config( + tp_size=tp_size, + dp_size=dp_size, + pcp_size=pcp_size, + is_sequence_parallel=is_sequence_parallel, + parallel_config=vllm_config.parallel_config, ) - assert moe_parallel_config.is_sequence_parallel == is_sequence_parallel - - logger.debug("FusedMoEParallelConfig = %s", str(moe_parallel_config)) - - global_num_experts = num_experts + num_redundant_experts - logical_num_experts = num_experts + global_num_experts, logical_num_experts, num_fused_shared_experts = ( + determine_expert_counts( + num_experts, + num_redundant_experts, + n_shared_experts, + is_act_and_mul, + ) + ) # Initialize EPLB manager (or None?) 
eplb_manager: EplbManager | None = None if enable_eplb: - eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) - - expert_placement_strategy: ExpertPlacementStrategy = ( - vllm_config.parallel_config.expert_placement_strategy - ) - - # ROCm aiter shared experts fusion - # AITER only supports gated activations (silu/gelu), so disable it - # for non-gated MoE (is_act_and_mul=False) - rocm_aiter_fmoe_enabled = ( - rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul - ) - aiter_fmoe_shared_expert_enabled = ( - rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul - ) - - num_fused_shared_experts = ( - n_shared_experts - if n_shared_experts is not None and aiter_fmoe_shared_expert_enabled - else 0 - ) - if not aiter_fmoe_shared_expert_enabled and num_fused_shared_experts != 0: - raise ValueError( - "n_shared_experts is only supported on ROCm aiter when " - "VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is enabled" - ) - - # Determine expert maps - if moe_parallel_config.use_ep: - if eplb_manager is not None: - # Validate EPLB configuration - eplb_manager.validate_configuration(global_num_experts, ep_size) - else: - assert num_redundant_experts == 0, ( - "Redundant experts are only supported with EPLB." - ) - - # Determine expert placement strategy before creating manager - expert_placement_strategy_effective = determine_expert_placement_strategy( - expert_placement_strategy=expert_placement_strategy, - moe_parallel_config=moe_parallel_config, - num_expert_group=num_expert_group, + eplb_manager = EplbManager( + ep_size=moe_parallel_config.ep_size, + global_num_experts=global_num_experts, num_redundant_experts=num_redundant_experts, - enable_eplb=eplb_manager is not None, ) else: - expert_placement_strategy_effective = expert_placement_strategy + assert num_redundant_experts == 0, ( + "Redundant experts are only supported with EPLB." 
+ ) # Create expert map manager self.expert_map_manager = ExpertMapManager( @@ -304,17 +234,16 @@ def __init__( top_k=top_k, global_num_experts=global_num_experts, logical_num_experts=logical_num_experts, + num_redundant_experts=num_redundant_experts, + num_expert_group=num_expert_group, moe_parallel_config=moe_parallel_config, - placement_strategy=expert_placement_strategy_effective, + placement_strategy=vllm_config.parallel_config.expert_placement_strategy, + enable_eplb=eplb_manager is not None, num_fused_shared_experts=num_fused_shared_experts, - rocm_aiter_enabled=rocm_aiter_fmoe_enabled, + rocm_aiter_enabled=rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul, device=vllm_config.device_config.device, ) - tp_size = moe_parallel_config.tp_size - assert intermediate_size % tp_size == 0 - intermediate_size_per_partition = intermediate_size // tp_size - self._runner: MoERunner # TODO(bnell): we should not have to create a router if the kernel is @@ -334,42 +263,42 @@ def __init__( e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=num_fused_shared_experts, eplb_manager=eplb_manager, + zero_expert_type=zero_expert_type, + num_logical_experts=logical_num_experts, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. + # THIS IS BAD indices_type_getter=lambda: self._runner.routed_experts.quant_method.topk_indices_dtype, # noqa: E501 - zero_expert_type=zero_expert_type, - num_logical_experts=logical_num_experts, ) + # TODO: move this??????????? is this even needed??? # When using zero experts, slice e_score_correction_bias to cover # only real experts, for compatibility with monolithic kernels that # read it directly. 
- if zero_expert_type is not None and e_score_correction_bias is not None: + if ( + False + and zero_expert_type is not None + and e_score_correction_bias is not None + ): self.e_score_correction_bias = e_score_correction_bias[logical_num_experts] - # Round up hidden size before creating moe_config. - # This way moe_config is created with the correct hidden_size from the start. - unpadded_hidden_size = hidden_size - model_type = ( - vllm_config.model_config.hf_config.model_type - if vllm_config.model_config is not None - else None - ) - hidden_size = maybe_roundup_hidden_size( - hidden_size=hidden_size, - act_dtype=moe_in_dtype, - moe_parallel_config=moe_parallel_config, - is_lora_enabled=vllm_config.lora_config is not None, - model_type=model_type, - ) - - moe_activation = MoEActivation.from_str(activation) + # FIXME (varun): We should have a better way of inferring the activation + # datatype. This works for now as the tensor datatype entering the MoE + # operation is typically unquantized (i.e. float16/bfloat16). + if vllm_config.model_config is not None: + moe_in_dtype = vllm_config.model_config.dtype + elif params_dtype is not None: + # TODO (bnell): This is a hack to get test_mixtral_moe to work + # since model_config is not set in the pytest test. 
+ moe_in_dtype = params_dtype + else: + moe_in_dtype = torch.get_default_dtype() moe_config = FusedMoEConfig( num_experts=global_num_experts, experts_per_token=top_k, hidden_dim=hidden_size, - intermediate_size_per_partition=intermediate_size_per_partition, + intermediate_size=intermediate_size, num_local_experts=self.expert_map_manager.local_num_experts, num_logical_experts=logical_num_experts, moe_parallel_config=moe_parallel_config, @@ -378,7 +307,6 @@ def __init__( router_logits_dtype=router_logits_dtype, max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, has_bias=has_bias, - is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, activation=moe_activation, device=vllm_config.device_config.device, @@ -387,8 +315,6 @@ def __init__( disable_inplace=disable_inplace() or shared_experts is not None, ) - quant_config = quant_config - logger.debug("FusedMoEConfig = %s", moe_config) # Create RoutedExperts instance BEFORE create_weights() @@ -396,13 +322,12 @@ def __init__( routed_experts = RoutedExperts( self.layer_name, params_dtype, - unpadded_hidden_size, + hidden_size, intermediate_size, moe_config, quant_config, expert_map_manager=self.expert_map_manager, # Extra params that are needed by quant_methods, pass along for now - rocm_aiter_fmoe_enabled=rocm_aiter_fmoe_enabled, # get from moe config top_k=top_k, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -410,7 +335,8 @@ def __init__( custom_routing_function=custom_routing_function, scoring_func=scoring_func, routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, # get from router? + # TODO get from router? needs to be truncated? + e_score_correction_bias=e_score_correction_bias, apply_router_weight_on_input=apply_router_weight_on_input, activation=moe_activation, ) @@ -419,20 +345,6 @@ def __init__( # ditch this eventually. self.routed_experts = routed_experts - # Where to move this? 
- quant_method = routed_experts.quant_method - if enable_eplb and not quant_method.supports_eplb: - # TODO: Add support for additional quantization methods. - # The implementation for other quantization methods does not - # contain essential differences, but the current quant API - # design causes duplicated work when extending to new - # quantization methods, so I'm leaving it for now. - # If you plan to add support for more quantization methods, - # please refer to the implementation in `Fp8MoEMethod`. - raise NotImplementedError( - f"EPLB is not supported {quant_method.__class__.__name__}." - ) - # Storing the runner in the FusedMoE is an intermediate state, eventually # the runner will own the FusedMoE layer and provide the execution interface # for MoE ops. @@ -451,6 +363,7 @@ def __init__( ) # HACK XXXXXXXXXXXXXXXXXXXXXXXX + # This is needed by various _setup_kernels in quant methods. routed_experts.shared_experts = self._runner.shared_experts # For smuggling this layer into the fused moe custom op @@ -583,7 +496,9 @@ def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: def get_expert_weights(self) -> Iterable[torch.Tensor]: """Delegate to EPLB manager.""" if self._runner.router.eplb_manager is not None: - return self._runner.router.eplb_manager.get_expert_weights(self) + return self._runner.router.eplb_manager.get_expert_weights( + self.routed_experts + ) else: return [] @@ -608,6 +523,10 @@ def set_eplb_state( logical_replica_count, ) + # + # Weight loading + # + @classmethod def make_expert_params_mapping( cls, diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 709aa44af072..b915272db2ed 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -16,6 +16,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, + 
FusedMoEParallelConfig, ) from vllm.model_executor.layers.fused_moe.expert_map_manager import ( ExpertMapManager, @@ -33,6 +34,42 @@ logger = init_logger(__name__) +# TODO(rob): move this down to the kernel. +def maybe_roundup_hidden_size( + hidden_size: int, + act_dtype: torch.dtype, + moe_parallel_config: FusedMoEParallelConfig, + is_lora_enabled: bool, + model_type: str | None, +) -> int: + """ + Given layer hidden size and MoE configurations, round up hidden_size + if necessary. + + Args: + hidden_size: Layer hidden-size + act_dtype: Data type of the layer activations. + moe_parallel_config: Fused MoE parallelization strategy configuration. + is_lora_enabled: True if the engine is enabled with LoRA. This + is used in the case of mxfp4 quantization in selecting the + MxFP4Backend. + model_type: for checking if gpt-oss + + Return: + Rounded up hidden_size if rounding up is required based on the configs. + Original hidden size otherwise. + """ + from vllm.model_executor.layers.fused_moe.all2all_utils import ( + maybe_roundup_layer_hidden_size, + ) + + hidden_size = maybe_roundup_layer_hidden_size( + hidden_size, act_dtype, moe_parallel_config + ) + + return hidden_size + + class FusedMoeWeightScaleSupported(Enum): TENSOR = "tensor" CHANNEL = "channel" @@ -53,7 +90,7 @@ class RoutedExperts(torch.nn.Module): def __init__( self, layer_name: str, - params_dtype: torch.dtype, + params_dtype: torch.dtype | None, unpadded_hidden_size: int, # put in moe_config? 
intermediate_size: int, moe_config: FusedMoEConfig, @@ -67,9 +104,6 @@ def __init__( self.quant_config = quant_config self.expert_map_manager = expert_map_manager self.hidden_size = moe_config.hidden_dim - self.intermediate_size_per_partition = ( - moe_config.intermediate_size_per_partition - ) self.global_num_experts = moe_config.num_experts self.local_num_experts = moe_config.num_local_experts @@ -80,6 +114,8 @@ def __init__( if self.expert_map_manager.expert_mask is not None: self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + self.rocm_aiter_fmoe_enabled = moe_config.rocm_aiter_fmoe_enabled + # Bit of hack until things are settled self.__dict__.update(kwargs) @@ -89,6 +125,69 @@ def __init__( self.moe_config, ) + # + # TODO: this will be replaced by method on quant_method. + # + + vllm_config = get_current_vllm_config() + + # Round up hidden size before creating moe_config. + # This way moe_config is created with the correct hidden_size from the start. + unpadded_hidden_size = self.moe_config.hidden_dim + model_type = ( + vllm_config.model_config.hf_config.model_type + if vllm_config.model_config is not None + else None + ) + + # FIXME (varun): We should have a better way of inferring the activation + # datatype. This works for now as the tensor datatype entering the MoE + # operation is typically unquantized (i.e. float16/bfloat16). + if vllm_config.model_config is not None: + moe_in_dtype = vllm_config.model_config.dtype + elif params_dtype is not None: + # TODO (bnell): This is a hack to get test_mixtral_moe to work + # since model_config is not set in the pytest test. 
+ moe_in_dtype = params_dtype + else: + params_dtype = torch.get_default_dtype() + moe_in_dtype = params_dtype + + hidden_size = maybe_roundup_hidden_size( + hidden_size=self.moe_config.hidden_dim, + act_dtype=moe_in_dtype, + moe_parallel_config=self.moe_config.moe_parallel_config, + is_lora_enabled=vllm_config.lora_config is not None, + model_type=model_type, + ) + + self.moe_config.hidden_dim = hidden_size + # self.moe_config.intermediate_size_per_partition = ( + # intermediate_size_per_partition + # ) + self.intermediate_size_per_partition = ( + moe_config.intermediate_size_per_partition + ) + + # + # END TODO: this will be replaced by method on quant_method. + # + + if ( + self.moe_config.moe_parallel_config.enable_eplb + and not self.quant_method.supports_eplb + ): + # TODO: Add support for additional quantization methods. + # The implementation for other quantization methods does not + # contain essential differences, but the current quant API + # design causes duplicated work when extending to new + # quantization methods, so I'm leaving it for now. + # If you plan to add support for more quantization methods, + # please refer to the implementation in `Fp8MoEMethod`. + raise NotImplementedError( + f"EPLB is not supported {self.quant_method.__class__.__name__}." 
+ ) + moe_quant_params = { "num_experts": moe_config.num_local_experts, "hidden_size": self.hidden_size, diff --git a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py index c0adeddde284..f66db463bd52 100644 --- a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py @@ -65,7 +65,7 @@ def __init__(self, inner: MoERunnerBase, **kwargs): def _replace_quant_method(self, quant_method: FusedMoEMethodBase): self.routed_experts.quant_method = quant_method self._inner._replace_quant_method(quant_method) - assert self._shared_experts == self._inner._shared_experts + assert self.shared_experts == self._inner.shared_experts def _init_dp_chunking(self) -> list[torch.Tensor]: states_shape: tuple[int, ...] diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index abf45a51bcff..e2c6227858a5 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -240,8 +240,8 @@ def shared_experts(self) -> SharedExperts | None: # TODO(bnell): Temporary hack. Get rid of this. def _replace_quant_method(self, quant_method: FusedMoEMethodBase): self.routed_experts.quant_method = quant_method - if self._shared_experts is not None: - self._shared_experts._mk_owns_shared_expert = ( + if self.shared_experts is not None: + self.shared_experts._mk_owns_shared_expert = ( quant_method.mk_owns_shared_expert ) @@ -250,11 +250,11 @@ def _select_forward(self) -> Callable: # TODO: Once the OOM issue for the TPU backend is resolved, we # will switch to using the moe_forward custom op. # Note: CPU doesn't require wrapped _forward_impl. 
- return _moe_forward if self._shared_experts is None else _moe_forward_shared + return _moe_forward if self.shared_experts is None else _moe_forward_shared return ( torch.ops.vllm.moe_forward - if self._shared_experts is None + if self.shared_experts is None else torch.ops.vllm.moe_forward_shared ) @@ -280,7 +280,7 @@ def apply_routed_input_transform( return ( hidden_states, - hidden_states if self._shared_experts is not None else None, + hidden_states if self.shared_experts is not None else None, ) def apply_routed_output_transform( @@ -332,7 +332,7 @@ def _must_reduce_shared_expert_output(self) -> bool: early. """ return ( - self._shared_experts is not None + self.shared_experts is not None and self.routed_experts.quant_method.moe_kernel is not None and self.routed_experts.quant_method.moe_kernel.output_is_reduced() ) @@ -441,9 +441,9 @@ def _maybe_apply_shared_experts( model's overlap strategy. Only fires if shared experts are configured and the order matches the shared experts' configured execution point. 
""" - if self._shared_experts is not None: + if self.shared_experts is not None: assert shared_experts_input is not None - self._shared_experts(shared_experts_input, order) + self.shared_experts(shared_experts_input, order) def _apply_quant_method( self, @@ -489,7 +489,7 @@ def _apply_quant_method( ) return ( - self._shared_experts.output if self._shared_experts is not None else None, + self.shared_experts.output if self.shared_experts is not None else None, fused_out, ) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 8e313226120a..b866f2bb5766 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -225,7 +225,6 @@ def __init__( scoring_func="sigmoid", e_score_correction_bias=self.gate.e_score_correction_bias, activation=activation_without_mul(config.mlp_hidden_act), - is_act_and_mul=False, # non-gated MoE enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts, is_sequence_parallel=self.is_sequence_parallel, From 3fcc8167182fe1504b6b788f877dd2b4f7dc6365 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 10 Apr 2026 20:00:00 +0000 Subject: [PATCH 098/191] remove FusedMoE/SharedFusedMoE classes Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 4 +- .../base_device_communicator.py | 13 +- .../distributed/elastic_ep/elastic_execute.py | 6 +- vllm/lora/layers/fused_moe.py | 1 + vllm/lora/model_manager.py | 1 + vllm/lora/utils.py | 1 + .../layers/fused_moe/__init__.py | 24 +- vllm/model_executor/layers/fused_moe/layer.py | 624 ++++++------------ .../layers/fused_moe/router/base_router.py | 25 +- .../fused_moe/router/custom_routing_router.py | 2 - .../fused_moe/router/fused_moe_router.py | 1 + .../router/fused_topk_bias_router.py | 3 - .../fused_moe/router/fused_topk_router.py | 2 - .../fused_moe/router/grouped_topk_router.py | 3 - .../layers/fused_moe/router/router_factory.py | 8 - .../router/routing_simulator_router.py | 3 - 
.../fused_moe/router/zero_expert_router.py | 3 - .../layers/fused_moe/runner/moe_runner.py | 89 +++ .../fused_moe/runner/moe_runner_base.py | 160 ++++- .../layers/fused_moe/shared_fused_moe.py | 37 +- .../fused_moe/unquantized_fused_moe_method.py | 6 + .../layers/quantization/bitsandbytes.py | 8 +- .../layers/quantization/gguf.py | 8 +- vllm/model_executor/models/AXK1.py | 9 +- vllm/model_executor/models/afmoe.py | 10 +- vllm/model_executor/models/aria.py | 13 +- vllm/model_executor/models/bailing_moe.py | 7 +- .../models/bailing_moe_linear.py | 7 +- vllm/model_executor/models/dbrx.py | 8 +- vllm/model_executor/models/deepseek_eagle.py | 6 +- vllm/model_executor/models/deepseek_mtp.py | 6 +- vllm/model_executor/models/deepseek_v2.py | 5 +- vllm/model_executor/models/dots1.py | 7 +- vllm/model_executor/models/ernie45_moe.py | 10 +- vllm/model_executor/models/ernie45_vl_moe.py | 7 +- vllm/model_executor/models/exaone_moe.py | 8 +- vllm/model_executor/models/glm4_moe.py | 7 +- vllm/model_executor/models/glm4_moe_lite.py | 10 +- .../models/glm4_moe_lite_mtp.py | 9 +- vllm/model_executor/models/glm4_moe_mtp.py | 9 +- vllm/model_executor/models/gpt_oss.py | 8 +- vllm/model_executor/models/granitemoe.py | 7 +- vllm/model_executor/models/grok1.py | 7 +- vllm/model_executor/models/hunyuan_v1.py | 7 +- vllm/model_executor/models/jamba.py | 7 +- vllm/model_executor/models/kimi_linear.py | 7 +- vllm/model_executor/models/lfm2_moe.py | 7 +- vllm/model_executor/models/llama4.py | 9 +- vllm/model_executor/models/longcat_flash.py | 7 +- vllm/model_executor/models/mimo_v2_flash.py | 7 +- vllm/model_executor/models/minimax_m2.py | 7 +- vllm/model_executor/models/mixtral.py | 7 +- vllm/model_executor/models/mllama4.py | 6 +- vllm/model_executor/models/nemotron_h.py | 3 +- vllm/model_executor/models/nemotron_h_mtp.py | 6 +- vllm/model_executor/models/olmoe.py | 7 +- vllm/model_executor/models/openpangu.py | 7 +- vllm/model_executor/models/openpangu_mtp.py | 6 +- 
vllm/model_executor/models/phimoe.py | 7 +- vllm/model_executor/models/qwen2_moe.py | 7 +- vllm/model_executor/models/qwen3_5_mtp.py | 6 +- vllm/model_executor/models/qwen3_moe.py | 7 +- vllm/model_executor/models/qwen3_next.py | 7 +- vllm/model_executor/models/qwen3_next_mtp.py | 6 +- vllm/model_executor/models/sarvam.py | 10 +- vllm/model_executor/models/step3p5.py | 11 +- .../model_executor/models/transformers/moe.py | 8 +- .../model_executor/warmup/deep_gemm_warmup.py | 9 +- vllm/utils/__init__.py | 13 + vllm/v1/worker/gpu_model_runner.py | 7 +- 70 files changed, 772 insertions(+), 618 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index 9e1d07950b4c..c10317b4e853 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -951,9 +951,6 @@ def make_fake_moe_layer( e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=0, # TODO eplb_manager=eplb_manager, - # TODO(bnell): once we can construct the MK at init time, we - # can make this a value. 
- indices_type_getter=lambda: indices_type, ) if quant_dtype is not None: @@ -993,6 +990,7 @@ def _moe( topk_weights, topk_ids = router.select_experts( hidden_states=hidden_states, router_logits=router_logits, + topk_indices_dtype=indices_type, ) # Shared experts use original (untransformed) hidden_states diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 2125f7381fe2..0b4b81f93bb4 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -7,6 +7,8 @@ import torch.distributed as dist from torch.distributed import ProcessGroup +from vllm.utils import is_moe_layer + class Cache: def __init__(self): @@ -317,16 +319,7 @@ def prepare_communication_buffer_for_model(self, model: torch.nn.Module) -> None if not self.is_ep_communicator: return - moe_modules = [ - module - for module in model.modules() - # TODO(bnell): Should use isinstance but can't. Maybe search for - # presence of quant_method.maybe_init_modular_kernel? 
- if ( - module.__class__.__name__ == "FusedMoE" - or module.__class__.__name__ == "SharedFusedMoE" - ) - ] + moe_modules = [module for module in model.modules() if is_moe_layer(module)] for module in moe_modules: module.maybe_init_modular_kernel() diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py index e911ad66764f..67fac2358849 100644 --- a/vllm/distributed/elastic_ep/elastic_execute.py +++ b/vllm/distributed/elastic_ep/elastic_execute.py @@ -36,6 +36,7 @@ from vllm.distributed.stateless_coordinator import StatelessGroupCoordinator from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import FusedMoEParallelConfig +from vllm.utils import is_moe_layer from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper from vllm.v1.worker.workspace import lock_workspace, unlock_workspace @@ -317,10 +318,7 @@ def switch_and_prepare(self) -> None: moe_modules = [ module for module in self.worker.model_runner.model.modules() - if ( - module.__class__.__name__ == "FusedMoE" - or module.__class__.__name__ == "SharedFusedMoE" - ) + if is_moe_layer(module) ] num_local_experts = moe_modules[0].moe_config.num_local_experts assert all( diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 2b9bb4c584fe..7fa95e3a7d99 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -41,6 +41,7 @@ from .utils import _get_lora_device, try_get_optimal_moe_lora_config +# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX class FusedMoEWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: FusedMoE) -> None: super().__init__() diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 9d3772560433..e5177055252a 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -393,6 +393,7 @@ def _parent_module(module_name: str) -> str: parts = module_name.split(".")[-1] 
packed_moduled_lst = self.packed_modules_mapping.get(parts, []) + # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX if isinstance(module, FusedMoE): # packed_moduled_lst is used here to just determine whether to # instantiate FusedMoE3DWithLoRA or FusedMoEWithLoRA, and the diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 75ed9674af56..8501cb3607d7 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -95,6 +95,7 @@ def get_lora_id(): } +# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX def is_moe_model(model: nn.Module) -> bool: """Checks if the model contains FusedMoE layers and warns the user.""" if any(isinstance(module, FusedMoE) for module in model.modules()): diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index f01d4932be4f..0c45d22696d2 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -18,6 +18,7 @@ ) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, + fused_moe_make_expert_params_mapping, ) from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEActivationFormat, @@ -32,6 +33,12 @@ FusedMoERouter, ) from vllm.model_executor.layers.fused_moe.router.gate_linear import GateLinear +from vllm.model_executor.layers.fused_moe.runner.moe_runner import ( + MoERunner, +) +from vllm.model_executor.layers.fused_moe.runner.moe_runner_factory import ( + create_moe_runner, +) from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, @@ -56,23 +63,26 @@ def get_config() -> dict[str, Any] | None: __all__ = [ "FusedMoE", - "FusedMoERouter", + "FusedMoEActivationFormat", "FusedMoEConfig", - "FusedMoEMethodBase", - "MoEActivation", - "UnquantizedFusedMoEMethod", - "FusedMoeWeightScaleSupported", "FusedMoEExpertsModular", - "FusedMoEActivationFormat", + "FusedMoEMethodBase", 
"FusedMoEPrepareAndFinalizeModular", + "FusedMoERouter", + "FusedMoeWeightScaleSupported", "GateLinear", + "MoEActivation", + "MoERunner", "RoutedExperts", "RoutingMethodType", "SharedFusedMoE", + "UnquantizedFusedMoEMethod", "activation_without_mul", "apply_moe_activation", - "override_config", + "create_moe_runner", + "fused_moe_make_expert_params_mapping", "get_config", + "override_config", ] if HAS_TRITON: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 31d6e172c541..2d533f1e02cf 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,21 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable, Iterable +from collections.abc import Callable import torch import vllm.envs as envs from vllm._aiter_ops import rocm_aiter_ops from vllm.config import ParallelConfig, VllmConfig, get_current_vllm_config -from vllm.config.parallel import ExpertPlacementStrategy from vllm.distributed import ( get_dp_group, get_pcp_group, get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger -from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -25,12 +23,6 @@ from vllm.model_executor.layers.fused_moe.expert_map_manager import ( ExpertMapManager, ) -from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( - FusedMoEMethodBase, -) -from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( - FusedMoEModularMethod, -) from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, @@ -53,7 +45,7 @@ def register_layer_for_moe_forward_op( vllm_config: VllmConfig, - 
layer: torch.nn.Module, # FusedMoE for now + layer: MoERunner, ): # For smuggling this layer into the fused moe custom op prefix = layer.layer_name @@ -125,9 +117,41 @@ def determine_expert_counts( return global_num_experts, logical_num_experts, num_fused_shared_experts -# --8<-- [start:fused_moe] -@CustomOp.register("fused_moe") -class FusedMoE(CustomOp): +def FusedMoE( + num_experts: int, # Global number of experts + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype | None = None, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: int | None = None, + topk_group: int | None = None, + quant_config: QuantizationConfig | None = None, + tp_size: int | None = None, + dp_size: int | None = None, + pcp_size: int | None = None, + prefix: str = "", + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + num_redundant_experts: int = 0, + has_bias: bool = False, + is_sequence_parallel=False, + expert_mapping: list[tuple[str, str, int, str]] | None = None, + n_shared_experts: int | None = None, + router_logits_dtype: torch.dtype | None = None, + gate: torch.nn.Module | None = None, + shared_experts: torch.nn.Module | None = None, + routed_input_transform: torch.nn.Module | None = None, + routed_output_transform: torch.nn.Module | None = None, + apply_scale_to_output: bool = False, + zero_expert_type: str | None = None, +) -> MoERunner: """FusedMoE layer for MoE models. This layer contains both MergedColumnParallel weights (gate_up_proj / @@ -148,422 +172,190 @@ class FusedMoE(CustomOp): enable_eplb: Whether to enable expert parallelism load balancer. router_logits_dtype: Data type for router logits buffers. 
""" + vllm_config = get_current_vllm_config() - # --8<-- [end:fused_moe] - - def __init__( - self, - num_experts: int, # Global number of experts - top_k: int, - hidden_size: int, - intermediate_size: int, - params_dtype: torch.dtype | None = None, - renormalize: bool = True, - use_grouped_topk: bool = False, - num_expert_group: int | None = None, - topk_group: int | None = None, - quant_config: QuantizationConfig | None = None, - tp_size: int | None = None, - dp_size: int | None = None, - pcp_size: int | None = None, - prefix: str = "", - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - num_redundant_experts: int = 0, - has_bias: bool = False, - is_sequence_parallel=False, - expert_mapping: list[tuple[str, str, int, str]] | None = None, - n_shared_experts: int | None = None, - router_logits_dtype: torch.dtype | None = None, - gate: torch.nn.Module | None = None, - shared_experts: torch.nn.Module | None = None, - routed_input_transform: torch.nn.Module | None = None, - routed_output_transform: torch.nn.Module | None = None, - apply_scale_to_output: bool = False, - zero_expert_type: str | None = None, - ): - super().__init__() - - vllm_config = get_current_vllm_config() - - # IMPORTANT: RoutedExperts must have same layer_name/prefix as FusedMoE for now - # This is still needed - self.layer_name = prefix - - moe_activation = MoEActivation.from_str(activation) - is_act_and_mul = moe_activation.is_gated - - moe_parallel_config = make_parallel_config( - tp_size=tp_size, - dp_size=dp_size, - pcp_size=pcp_size, - is_sequence_parallel=is_sequence_parallel, - parallel_config=vllm_config.parallel_config, - ) + layer_name = prefix + + moe_activation = MoEActivation.from_str(activation) + is_act_and_mul = moe_activation.is_gated - global_num_experts, 
logical_num_experts, num_fused_shared_experts = ( - determine_expert_counts( - num_experts, - num_redundant_experts, - n_shared_experts, - is_act_and_mul, - ) + moe_parallel_config = make_parallel_config( + tp_size=tp_size, + dp_size=dp_size, + pcp_size=pcp_size, + is_sequence_parallel=is_sequence_parallel, + parallel_config=vllm_config.parallel_config, + ) + + global_num_experts, logical_num_experts, num_fused_shared_experts = ( + determine_expert_counts( + num_experts, + num_redundant_experts, + n_shared_experts, + is_act_and_mul, ) + ) - # Initialize EPLB manager (or None?) - eplb_manager: EplbManager | None = None - if enable_eplb: - eplb_manager = EplbManager( - ep_size=moe_parallel_config.ep_size, - global_num_experts=global_num_experts, - num_redundant_experts=num_redundant_experts, - ) - else: - assert num_redundant_experts == 0, ( - "Redundant experts are only supported with EPLB." - ) - - # Create expert map manager - self.expert_map_manager = ExpertMapManager( - max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens, - top_k=top_k, + # Initialize EPLB manager (or None?) + eplb_manager: EplbManager | None = None + if enable_eplb: + eplb_manager = EplbManager( + ep_size=moe_parallel_config.ep_size, global_num_experts=global_num_experts, - logical_num_experts=logical_num_experts, num_redundant_experts=num_redundant_experts, - num_expert_group=num_expert_group, - moe_parallel_config=moe_parallel_config, - placement_strategy=vllm_config.parallel_config.expert_placement_strategy, - enable_eplb=eplb_manager is not None, - num_fused_shared_experts=num_fused_shared_experts, - rocm_aiter_enabled=rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul, - device=vllm_config.device_config.device, + ) + else: + assert num_redundant_experts == 0, ( + "Redundant experts are only supported with EPLB." 
) - self._runner: MoERunner + # Create expert map manager + expert_map_manager = ExpertMapManager( + max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens, + top_k=top_k, + global_num_experts=global_num_experts, + logical_num_experts=logical_num_experts, + num_redundant_experts=num_redundant_experts, + num_expert_group=num_expert_group, + moe_parallel_config=moe_parallel_config, + placement_strategy=vllm_config.parallel_config.expert_placement_strategy, + enable_eplb=eplb_manager is not None, + num_fused_shared_experts=num_fused_shared_experts, + rocm_aiter_enabled=rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul, + device=vllm_config.device_config.device, + ) - # TODO(bnell): we should not have to create a router if the kernel is - # monolithic. - router = create_fused_moe_router( - top_k=top_k, - global_num_experts=global_num_experts, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor - if not apply_scale_to_output - else 1.0, - e_score_correction_bias=e_score_correction_bias, - num_fused_shared_experts=num_fused_shared_experts, - eplb_manager=eplb_manager, - zero_expert_type=zero_expert_type, - num_logical_experts=logical_num_experts, - # TODO(bnell): once we can construct the MK at init time, we - # can make this a value. - # THIS IS BAD - indices_type_getter=lambda: self._runner.routed_experts.quant_method.topk_indices_dtype, # noqa: E501 - ) + # TODO(bnell): we should not have to create a router if the kernel is + # monolithic. 
+ router = create_fused_moe_router( + top_k=top_k, + global_num_experts=global_num_experts, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor + if not apply_scale_to_output + else 1.0, + e_score_correction_bias=e_score_correction_bias, + num_fused_shared_experts=num_fused_shared_experts, + eplb_manager=eplb_manager, + zero_expert_type=zero_expert_type, + num_logical_experts=logical_num_experts, + ) - # TODO: move this??????????? is this even needed??? - # When using zero experts, slice e_score_correction_bias to cover - # only real experts, for compatibility with monolithic kernels that - # read it directly. - if ( - False - and zero_expert_type is not None - and e_score_correction_bias is not None - ): - self.e_score_correction_bias = e_score_correction_bias[logical_num_experts] - - # FIXME (varun): We should have a better way of inferring the activation - # datatype. This works for now as the tensor datatype entering the MoE - # operation is typically unquantized (i.e. float16/bfloat16). - if vllm_config.model_config is not None: - moe_in_dtype = vllm_config.model_config.dtype - elif params_dtype is not None: - # TODO (bnell): This is a hack to get test_mixtral_moe to work - # since model_config is not set in the pytest test. 
- moe_in_dtype = params_dtype - else: - moe_in_dtype = torch.get_default_dtype() - - moe_config = FusedMoEConfig( - num_experts=global_num_experts, - experts_per_token=top_k, - hidden_dim=hidden_size, - intermediate_size=intermediate_size, - num_local_experts=self.expert_map_manager.local_num_experts, - num_logical_experts=logical_num_experts, - moe_parallel_config=moe_parallel_config, - in_dtype=moe_in_dtype, - moe_backend=vllm_config.kernel_config.moe_backend, - router_logits_dtype=router_logits_dtype, - max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, - has_bias=has_bias, - is_lora_enabled=vllm_config.lora_config is not None, - activation=moe_activation, - device=vllm_config.device_config.device, - routing_method=router.routing_method_type, - # TODO: in_dtype == out_dtype? - disable_inplace=disable_inplace() or shared_experts is not None, - ) + # TODO: move this??????????? is this even needed??? + # When using zero experts, slice e_score_correction_bias to cover + # only real experts, for compatibility with monolithic kernels that + # read it directly. + # if False and zero_expert_type is not None and e_score_correction_bias is not None: + # e_score_correction_bias = e_score_correction_bias[logical_num_experts] + + # FIXME (varun): We should have a better way of inferring the activation + # datatype. This works for now as the tensor datatype entering the MoE + # operation is typically unquantized (i.e. float16/bfloat16). + if vllm_config.model_config is not None: + moe_in_dtype = vllm_config.model_config.dtype + elif params_dtype is not None: + # TODO (bnell): This is a hack to get test_mixtral_moe to work + # since model_config is not set in the pytest test. 
+ moe_in_dtype = params_dtype + else: + moe_in_dtype = torch.get_default_dtype() + + moe_config = FusedMoEConfig( + num_experts=global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + intermediate_size=intermediate_size, + num_local_experts=expert_map_manager.local_num_experts, + num_logical_experts=logical_num_experts, + moe_parallel_config=moe_parallel_config, + in_dtype=moe_in_dtype, + moe_backend=vllm_config.kernel_config.moe_backend, + router_logits_dtype=router_logits_dtype, + max_num_tokens=envs.VLLM_MOE_DP_CHUNK_SIZE, + has_bias=has_bias, + is_lora_enabled=vllm_config.lora_config is not None, + activation=moe_activation, + device=vllm_config.device_config.device, + routing_method=router.routing_method_type, + # TODO: in_dtype == out_dtype? + disable_inplace=disable_inplace() or shared_experts is not None, + ) - logger.debug("FusedMoEConfig = %s", moe_config) - - # Create RoutedExperts instance BEFORE create_weights() - # This will hold all expert weight parameters - routed_experts = RoutedExperts( - self.layer_name, - params_dtype, - hidden_size, - intermediate_size, - moe_config, - quant_config, - expert_map_manager=self.expert_map_manager, - # Extra params that are needed by quant_methods, pass along for now - top_k=top_k, - use_grouped_topk=use_grouped_topk, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - # TODO get from router? needs to be truncated? 
- e_score_correction_bias=e_score_correction_bias, - apply_router_weight_on_input=apply_router_weight_on_input, - activation=moe_activation, - ) + logger.debug("FusedMoEConfig = %s", moe_config) + + # Create RoutedExperts instance BEFORE create_weights() + # This will hold all expert weight parameters + routed_experts = RoutedExperts( + layer_name, + params_dtype, + hidden_size, + intermediate_size, + moe_config, + quant_config, + expert_map_manager=expert_map_manager, + # Extra params that are needed by quant_methods, pass along for now + top_k=top_k, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + # TODO get from router? needs to be truncated? + e_score_correction_bias=e_score_correction_bias, + apply_router_weight_on_input=apply_router_weight_on_input, + activation=moe_activation, + ) - # TODO(bnell): this needs to be stored as a parameter for weight loading. - # ditch this eventually. - self.routed_experts = routed_experts - - # Storing the runner in the FusedMoE is an intermediate state, eventually - # the runner will own the FusedMoE layer and provide the execution interface - # for MoE ops. - self._runner = create_moe_runner( - layer_name=self.layer_name, - moe_config=moe_config, - router=router, - routed_input_transform=routed_input_transform, - routed_output_transform=routed_output_transform, - gate=gate, - shared_experts=shared_experts, - routed_experts=routed_experts, - enable_dbo=vllm_config.parallel_config.enable_dbo, - apply_scale_to_output=apply_scale_to_output, - routed_scaling_factor=routed_scaling_factor, - ) + # TODO(bnell): this needs to be stored as a parameter for weight loading. + # ditch this eventually. 
+ + # Storing the runner in the FusedMoE is an intermediate state, eventually + # the runner will own the FusedMoE layer and provide the execution interface + # for MoE ops. + runner = create_moe_runner( + layer_name=layer_name, + moe_config=moe_config, + router=router, + routed_input_transform=routed_input_transform, + routed_output_transform=routed_output_transform, + gate=gate, + shared_experts=shared_experts, + routed_experts=routed_experts, + enable_dbo=vllm_config.parallel_config.enable_dbo, + apply_scale_to_output=apply_scale_to_output, + routed_scaling_factor=routed_scaling_factor, + ) - # HACK XXXXXXXXXXXXXXXXXXXXXXXX - # This is needed by various _setup_kernels in quant methods. - routed_experts.shared_experts = self._runner.shared_experts - - # For smuggling this layer into the fused moe custom op - register_layer_for_moe_forward_op(vllm_config, self) - - # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py - # and vllm/distributed/elastic_ep/elastic_execute.py - # can safely swap out the quant_method. We should figure out a less - # intrusive way to do this. - def _replace_quant_method(self, mk: FusedMoEMethodBase): - self._runner._replace_quant_method(mk) - - # Note: maybe_init_modular_kernel should only be called by - # prepare_communication_buffer_for_model. - # This is called after all weight loading and post-processing, so it - # should be safe to swap out the quant_method. - def maybe_init_modular_kernel(self) -> None: - # NOTE(rob): WIP refactor. For quant methods that own the MK - # we create the MK during process_weights_after_loading. - if ( - self._runner.routed_experts.quant_method.supports_internal_mk - or self._runner.routed_experts.quant_method.is_monolithic - ): - return None - - self._runner.routed_experts._ensure_moe_quant_config_init() - # routing_tables only needed for round-robin expert placement with - # DeepEP all2all backend. 
- routing_tables = self._maybe_init_expert_routing_tables() - - if isinstance(self._runner.routed_experts.quant_method, FusedMoEModularMethod): - base_quant_method = ( - self._runner.routed_experts.quant_method.old_quant_method - ) - else: - base_quant_method = self._runner.routed_experts.quant_method - - prepare_finalize = base_quant_method.maybe_make_prepare_finalize( - routing_tables=routing_tables - ) - if prepare_finalize is not None: - logger.debug( - "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) - ) - self._replace_quant_method( - FusedMoEModularMethod.make( - self, - base_quant_method, - prepare_finalize, - self._runner.shared_experts, - inplace=not base_quant_method.moe.disable_inplace, - ) - ) - - # - # Properties - # - - @property - def layer_id(self): - # Delayed import to avoid circular dependency - from vllm.model_executor.models.utils import extract_layer_index - - return extract_layer_index(self.layer_name) - - # - # Attributes still needed by models - # - - @property - def is_monolithic(self) -> bool: - return self._runner.routed_experts.quant_method.is_monolithic - - @property - def activation(self) -> MoEActivation: - return self._runner.routed_experts.activation - - @property - def is_internal_router(self) -> bool: - # By default, router/gate is called before FusedMoE forward pass - return self._runner.is_internal_router - - # - # Expert maps - # - - @property - def expert_placement_strategy(self) -> ExpertPlacementStrategy: - return self.expert_map_manager.placement_strategy - - @property - def expert_global_to_physical(self) -> torch.Tensor | None: - tables = self.expert_map_manager.routing_tables - return tables[0] if tables else None - - @property - def expert_physical_to_global(self) -> torch.Tensor | None: - """Routing table: physical expert ID to global expert ID.""" - tables = self.expert_map_manager.routing_tables - return tables[1] if tables else None - - @property - def expert_local_to_global(self) -> 
torch.Tensor | None: - """Routing table: local expert ID to global expert ID.""" - tables = self.expert_map_manager.routing_tables - return tables[2] if tables else None - - @property - def expert_map(self) -> torch.Tensor | None: - return self._runner.routed_experts.expert_map - - def _maybe_init_expert_routing_tables( - self, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - return self._runner.routed_experts._maybe_init_expert_routing_tables() - - def update_expert_map(self): - self._runner.routed_experts.update_expert_map() - - def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - """Map global expert ID to local expert ID.""" - return self._runner.routed_experts._map_global_expert_id_to_local_expert_id( - expert_id - ) + # HACK XXXXXXXXXXXXXXXXXXXXXXXX + # This is needed by various _setup_kernels in quant methods. + routed_experts.shared_experts = runner.shared_experts - # - # EPLB - # - - def get_expert_weights(self) -> Iterable[torch.Tensor]: - """Delegate to EPLB manager.""" - if self._runner.router.eplb_manager is not None: - return self._runner.router.eplb_manager.get_expert_weights( - self.routed_experts - ) - else: - return [] - - def set_eplb_state( - self, - moe_layer_idx: int, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - """ - Register the EPLB state in this layer. - - This is used later in forward pass, where we get the expert mapping - and record the load metrics in `expert_load_view`. 
- """ - if self._runner.router.eplb_manager is not None: - self._runner.router.eplb_manager.set_state( - moe_layer_idx, - expert_load_view, - logical_to_physical_map, - logical_replica_count, - ) - - # - # Weight loading - # - - @classmethod - def make_expert_params_mapping( - cls, - model: torch.nn.Module, - ckpt_gate_proj_name: str, - ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int, - num_redundant_experts: int = 0, - ) -> list[tuple[str, str, int, str]]: - """Delegate to EPLB manager.""" - return RoutedExperts.make_expert_params_mapping( - model, - ckpt_gate_proj_name, - ckpt_down_proj_name, - ckpt_up_proj_name, - num_experts, - num_redundant_experts, - ) + # For smuggling this layer into the fused moe custom op + register_layer_for_moe_forward_op(vllm_config, runner) - # - # Execution - # - - def forward_native( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> torch.Tensor: - return self._runner.forward( - hidden_states, - router_logits, - ) + return runner - def forward_cuda( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> torch.Tensor: - return self.forward_native(hidden_states, router_logits) + +def fused_moe_make_expert_params_mapping( + model: torch.nn.Module, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + num_redundant_experts: int = 0, +) -> list[tuple[str, str, int, str]]: + """Delegate to EPLB manager.""" + return RoutedExperts.make_expert_params_mapping( + model, + ckpt_gate_proj_name, + ckpt_down_proj_name, + ckpt_up_proj_name, + num_experts, + num_redundant_experts, + ) diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index 605cf2e1546b..0349123e9bda 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -110,9 +110,6 @@ def __init__( top_k: int, 
global_num_experts: int, eplb_manager: EplbManager | None = None, - # TODO(bnell): Once the MK is constructed at layer init time, we - # can make this a plain value instead of a callback. - indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): """ Note: the indices dtype might not be available at router construction @@ -124,7 +121,6 @@ def __init__( self.top_k = top_k self.global_num_experts = global_num_experts self._eplb_manager = eplb_manager - self.indices_type_getter = indices_type_getter self.capture_fn: Callable[[torch.Tensor], None] | None = None @property @@ -150,12 +146,6 @@ def _validate_eplb_state(self) -> None: "enable_eplb=True requires logical_replica_count != None" ) - def _get_indices_type(self) -> torch.dtype | None: - """Get the desired indices dtype from the getter function.""" - return ( - self.indices_type_getter() if self.indices_type_getter is not None else None - ) - def _apply_eplb_mapping(self, topk_ids: torch.Tensor) -> torch.Tensor: """Apply EPLB mapping to convert logical expert IDs to physical expert IDs.""" if self.eplb_manager is not None: @@ -208,6 +198,7 @@ def select_experts( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, + topk_indices_dtype: torch.dtype | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Route the input hidden states to the top-k experts based on the @@ -215,10 +206,9 @@ def select_experts( This method implements the template method pattern: 1. Validates EPLB state - 2. Gets indices type - 3. Calls _compute_routing() to get topk_weights and topk_ids - 4. Applies EPLB mapping if enabled - 5. Converts indices dtype if needed + 2. Calls _compute_routing() to get topk_weights and topk_ids + 3. Applies EPLB mapping if enabled + 4. Converts indices dtype if needed Returns: (topk_weights, topk_ids) @@ -232,12 +222,9 @@ def select_experts( # Step 1: Validate EPLB state self._validate_eplb_state() - # Step 2: Get indices type. 
- indices_type = self._get_indices_type() - # Step 3: Compute routing (delegated to subclass) topk_weights, topk_ids = self._compute_routing( - hidden_states, router_logits, indices_type + hidden_states, router_logits, topk_indices_dtype ) # Capture logical ids before EPLB mapping. @@ -248,6 +235,6 @@ def select_experts( topk_ids = self._apply_eplb_mapping(topk_ids) # Step 5: Convert indices dtype - topk_ids = self._convert_indices_dtype(topk_ids, indices_type) + topk_ids = self._convert_indices_dtype(topk_ids, topk_indices_dtype) return topk_weights, topk_ids diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py index 5cf7061fbf37..7bde838a92d2 100644 --- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py +++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py @@ -19,13 +19,11 @@ def __init__( custom_routing_function: Callable, eplb_manager: EplbManager | None = None, renormalize: bool = True, - indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) self.custom_routing_function = custom_routing_function self.renormalize = renormalize diff --git a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py index e8151069a4cb..32334f6e9fe9 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py @@ -37,6 +37,7 @@ def select_experts( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, + topk_indices_dtype: torch.dtype | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Route the input hidden states to the top-k experts based on the diff --git 
a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index f5071996a722..03d20a4bca47 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -from collections.abc import Callable import torch @@ -182,13 +181,11 @@ def __init__( renormalize: bool = True, routed_scaling_factor: float = 1.0, eplb_manager: EplbManager | None = None, - indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) self.e_score_correction_bias = e_score_correction_bias self.renormalize = renormalize diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index dc0390a0348c..03150cf11748 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -123,13 +123,11 @@ def __init__( scoring_func: str = "softmax", renormalize: bool = True, eplb_manager: EplbManager | None = None, - indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) self.renormalize = renormalize self.scoring_func = scoring_func diff --git a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py index a427f287c14a..71fd3ceef00b 100644 --- a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py +++ 
b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from functools import partial import torch @@ -262,13 +261,11 @@ def __init__( e_score_correction_bias: torch.Tensor | None = None, num_fused_shared_experts: int = 0, eplb_manager: EplbManager | None = None, - indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) self.num_expert_group = num_expert_group self.topk_group = topk_group diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py index 502891ea29d8..187d0f7eb12a 100644 --- a/vllm/model_executor/layers/fused_moe/router/router_factory.py +++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py @@ -35,7 +35,6 @@ def create_fused_moe_router( top_k: int, global_num_experts: int, renormalize: bool = True, - indices_type_getter: Callable[[], torch.dtype | None] | None = None, # grouped topk parameters use_grouped_topk: bool = False, num_expert_group: int | None = None, @@ -69,7 +68,6 @@ def create_fused_moe_router( top_k: Number of experts to select per token global_num_experts: Total number of experts in the model renormalize: Whether to renormalize the routing weights - indices_type_getter: Function to get the desired indices dtype routing_method_type: Optional explicit routing method type Grouped topk arguments: @@ -105,7 +103,6 @@ def create_fused_moe_router( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) if zero_expert_type is not None: @@ -125,7 +122,6 @@ def create_fused_moe_router( renormalize=renormalize, routed_scaling_factor=routed_scaling_factor, 
eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) if use_grouped_topk: @@ -146,7 +142,6 @@ def create_fused_moe_router( e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=num_fused_shared_experts, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) if ( grouped_topk_router.routing_method_type != RoutingMethodType.Unspecified @@ -168,7 +163,6 @@ def create_fused_moe_router( custom_routing_function=custom_routing_function, renormalize=renormalize, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) if e_score_correction_bias is not None: @@ -180,7 +174,6 @@ def create_fused_moe_router( renormalize=renormalize, routed_scaling_factor=routed_scaling_factor, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) return FusedTopKRouter( @@ -189,5 +182,4 @@ def create_fused_moe_router( renormalize=renormalize, scoring_func=scoring_func, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py index 5d5ee87da28c..2cb36a922af8 100644 --- a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py +++ b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod -from collections.abc import Callable from typing import Any import torch @@ -314,13 +313,11 @@ def __init__( top_k: int, global_num_experts: int, eplb_manager: EplbManager | None = None, - indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) @property diff --git 
a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py index e202e324d1b4..333d0ac164ba 100644 --- a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py +++ b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable import torch @@ -39,13 +38,11 @@ def __init__( renormalize: bool = False, routed_scaling_factor: float = 1.0, eplb_manager: EplbManager | None = None, - indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, eplb_manager=eplb_manager, - indices_type_getter=indices_type_getter, ) self.e_score_correction_bias = e_score_correction_bias self.num_logical_experts = num_logical_experts diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 14b28f123b67..0dc7dcdd6810 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod +from collections.abc import Iterable import torch +from vllm.config.parallel import ExpertPlacementStrategy +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) @@ -48,3 +51,89 @@ def is_internal_router(self) -> bool: @abstractmethod def _replace_quant_method(self, quant_method: FusedMoEMethodBase): raise NotImplementedError + + ######################################################################## + # + # FusedMoE layer methods + # + 
######################################################################## + + @abstractmethod + def maybe_init_modular_kernel(self) -> None: + raise NotImplementedError + + @property + @abstractmethod + def layer_id(self): + raise NotImplementedError + + # + # Attributes still needed by models + # + + @property + @abstractmethod + def is_monolithic(self) -> bool: + raise NotImplementedError + + @property + @abstractmethod + def activation(self) -> MoEActivation: + raise NotImplementedError + + # + # Expert maps + # + + @property + @abstractmethod + def expert_placement_strategy(self) -> ExpertPlacementStrategy: + raise NotImplementedError + + @property + @abstractmethod + def expert_global_to_physical(self) -> torch.Tensor | None: + raise NotImplementedError + + @property + @abstractmethod + def expert_physical_to_global(self) -> torch.Tensor | None: + raise NotImplementedError + + @property + @abstractmethod + def expert_local_to_global(self) -> torch.Tensor | None: + raise NotImplementedError + + @property + @abstractmethod + def expert_map(self) -> torch.Tensor | None: + raise NotImplementedError + + @abstractmethod + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + raise NotImplementedError + + @abstractmethod + def update_expert_map(self): + raise NotImplementedError + + @abstractmethod + def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: + raise NotImplementedError + + @abstractmethod + def get_expert_weights(self) -> Iterable[torch.Tensor]: + raise NotImplementedError + + @abstractmethod + def set_eplb_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py index e2c6227858a5..914379c3bf8c 100644 --- 
a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py @@ -1,13 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod -from collections.abc import Callable +from collections.abc import Callable, Iterable from contextlib import nullcontext from typing import TYPE_CHECKING import torch import torch.nn.functional as F +from vllm.config.parallel import ExpertPlacementStrategy from vllm.distributed import ( tensor_model_parallel_all_reduce, ) @@ -17,12 +18,16 @@ is_forward_context_available, ) from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, ) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) +from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( + FusedMoEModularMethod, +) from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, @@ -45,7 +50,7 @@ logger = init_logger(__name__) -def get_layer_from_name(layer_name: str) -> torch.nn.Module: # FusedMoE +def get_layer_from_name(layer_name: str) -> MoERunner: forward_context: ForwardContext = get_forward_context() if layer_name == "from_forward_context": all_moe_layers = forward_context.all_moe_layers @@ -60,7 +65,7 @@ def get_layer_from_name(layer_name: str) -> torch.nn.Module: # FusedMoE layer_name = all_moe_layers[moe_layer_index] forward_context.moe_layer_index += 1 layer = forward_context.no_compile_layers[layer_name] - # assert isinstance(layer, FusedMoE) + assert isinstance(layer, MoERunner) return layer @@ -90,7 +95,7 @@ def _moe_forward( layer_name: _layer_name_type, ) -> torch.Tensor: layer = 
get_layer_from_name(_resolve_layer_name(layer_name)) - return layer._runner._forward_dispatch( + return layer._forward_dispatch( hidden_states, router_logits, shared_experts_input, @@ -113,7 +118,7 @@ def _moe_forward_shared( layer_name: _layer_name_type, ) -> tuple[torch.Tensor, torch.Tensor]: layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer._runner._forward_dispatch( + return layer._forward_dispatch( hidden_states, router_logits, shared_experts_input, @@ -220,7 +225,7 @@ def __init__( ) self.routed_scaling_factor = routed_scaling_factor - # Needed for string -> FusedMoE layer lookup in custom ops. + # Needed for string -> MoERunner layer lookup in custom ops. self.layer_name = layer_name self._forward_entry = self._select_forward() @@ -474,6 +479,7 @@ def _apply_quant_method( topk_weights, topk_ids = self.router.select_experts( hidden_states=hidden_states, router_logits=router_logits, + topk_indices_dtype=self.routed_experts.quant_method.topk_indices_dtype, ) fused_out = self.routed_experts.forward( @@ -678,3 +684,145 @@ def _forward_impl( are present. """ raise NotImplementedError + + # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + # + # Old methods from FusedMoE layer + # + # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + + # Note: maybe_init_modular_kernel should only be called by + # prepare_communication_buffer_for_model. + # This is called after all weight loading and post-processing, so it + # should be safe to swap out the quant_method. + def maybe_init_modular_kernel(self) -> None: + # NOTE(rob): WIP refactor. For quant methods that own the MK + # we create the MK during process_weights_after_loading. 
+ if ( + self.routed_experts.quant_method.supports_internal_mk + or self.routed_experts.quant_method.is_monolithic + ): + return None + + self.routed_experts._ensure_moe_quant_config_init() + # routing_tables only needed for round-robin expert placement with + # DeepEP all2all backend. + routing_tables = self._maybe_init_expert_routing_tables() + + if isinstance(self.routed_experts.quant_method, FusedMoEModularMethod): + base_quant_method = self.routed_experts.quant_method.old_quant_method + else: + base_quant_method = self.routed_experts.quant_method + + prepare_finalize = base_quant_method.maybe_make_prepare_finalize( + routing_tables=routing_tables + ) + if prepare_finalize is not None: + logger.debug( + "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) + ) + self._replace_quant_method( + FusedMoEModularMethod.make( + self, + base_quant_method, + prepare_finalize, + self.shared_experts, + inplace=not base_quant_method.moe.disable_inplace, + ) + ) + + # + # Properties + # + + @property + def layer_id(self): + # Delayed import to avoid circular dependency + from vllm.model_executor.models.utils import extract_layer_index + + return extract_layer_index(self.layer_name) + + # + # Attributes still needed by models + # + + @property + def is_monolithic(self) -> bool: + return self.routed_experts.quant_method.is_monolithic + + @property + def activation(self) -> MoEActivation: + return self.routed_experts.activation + + # + # Expert maps + # + + @property + def expert_placement_strategy(self) -> ExpertPlacementStrategy: + return self.expert_map_manager.placement_strategy + + @property + def expert_global_to_physical(self) -> torch.Tensor | None: + tables = self.expert_map_manager.routing_tables + return tables[0] if tables else None + + @property + def expert_physical_to_global(self) -> torch.Tensor | None: + """Routing table: physical expert ID to global expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[1] if tables 
else None + + @property + def expert_local_to_global(self) -> torch.Tensor | None: + """Routing table: local expert ID to global expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[2] if tables else None + + @property + def expert_map(self) -> torch.Tensor | None: + return self.routed_experts.expert_map + + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + return self.routed_experts._maybe_init_expert_routing_tables() + + def update_expert_map(self): + self.routed_experts.update_expert_map() + + def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: + """Map global expert ID to local expert ID.""" + return self.routed_experts._map_global_expert_id_to_local_expert_id(expert_id) + + # + # EPLB + # + + def get_expert_weights(self) -> Iterable[torch.Tensor]: + """Delegate to EPLB manager.""" + if self.router.eplb_manager is not None: + return self.router.eplb_manager.get_expert_weights(self.routed_experts) + else: + return [] + + def set_eplb_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + """ + Register the EPLB state in this layer. + + This is used later in forward pass, where we get the expert mapping + and record the load metrics in `expert_load_view`. 
+ """ + if self.router.eplb_manager is not None: + self.router.eplb_manager.set_state( + moe_layer_idx, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ) diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py index 9cfcb1baa9bb..c2ffbeedc41d 100644 --- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py @@ -1,25 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import torch from vllm.model_executor.layers.fused_moe.layer import FusedMoE - +from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner # TODO(bnell): Remove this entirely -class SharedFusedMoE(FusedMoE): - """ - A FusedMoE operation that also computes the results of shared experts. - If an all2all communicator is being used the shared expert computation - can be interleaved with the fused all2all dispatch communication step. - """ +# class SharedFusedMoE(FusedMoE): +# """ +# A FusedMoE operation that also computes the results of shared experts. +# If an all2all communicator is being used the shared expert computation +# can be interleaved with the fused all2all dispatch communication step. 
+# """ + +# def forward( +# self, +# hidden_states: torch.Tensor, +# router_logits: torch.Tensor, +# ) -> torch.Tensor: +# return super().forward( +# hidden_states=hidden_states, +# router_logits=router_logits, +# ) + - def forward( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> torch.Tensor: - return super().forward( - hidden_states=hidden_states, - router_logits=router_logits, - ) +def SharedFusedMoE(*args, **kwargs) -> MoERunner: + return FusedMoE(*args, **kwargs) diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 8a80f1d1260e..de100fde9387 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -235,6 +235,12 @@ def _setup_kernel( moe_config=self.moe, ) + @property + def topk_indices_dtype(self) -> torch.dtype | None: + if self.kernel is not None: + return self.kernel.prepare_finalize.topk_indices_dtype() + return None + def process_weights_after_loading(self, layer: "RoutedExperts") -> None: super().process_weights_after_loading(layer) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 6aa4af85ae4f..0bfb146f1eca 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -6,14 +6,14 @@ import torch from packaging import version +from vllm.model_executor.layers.fused_moe import ( + FusedMoEMethodBase, + RoutedExperts, +) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoEMethodBase, - RoutedExperts, -) from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/gguf.py 
b/vllm/model_executor/layers/quantization/gguf.py index a60ee3d82b34..37ccd9ab28d1 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -12,6 +12,10 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import ( + FusedMoEMethodBase, + RoutedExperts, +) from vllm.model_executor.layers.fused_moe.activation import ( MoEActivation, apply_moe_activation, @@ -20,10 +24,6 @@ FusedMoEConfig, FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoEMethodBase, - RoutedExperts, -) from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/models/AXK1.py b/vllm/model_executor/models/AXK1.py index 05e5a77fea37..84ce761b69d0 100644 --- a/vllm/model_executor/models/AXK1.py +++ b/vllm/model_executor/models/AXK1.py @@ -42,7 +42,10 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -940,7 +943,7 @@ def compute_logits( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", @@ -974,7 +977,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = 
SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 5bad52a0c496..ba8dd73d4f7b 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -18,7 +18,11 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe.shared_fused_moe import ( + MoERunner, + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -479,7 +483,7 @@ def make_empty_intermediate_tensors( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", @@ -637,7 +641,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_moe_layers = config.num_hidden_layers - config.num_dense_layers self.num_expert_groups = config.n_group - self.moe_layers: list[SharedFusedMoE] = [] + self.moe_layers: list[MoERunner] = [] example_moe = None for layer in self.model.layers: if isinstance(layer, PPMissingLayer): diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 6174cd9d1bf5..769aa23ffe66 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -13,7 +13,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_rank from 
vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -214,7 +214,16 @@ def forward( return out -class AriaFusedMoE(SharedFusedMoE): +# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx +class AriaFusedMoE(torch.nn.Module): + def __init__(self, *args, **kwargs): + self.moe = FusedMoE(*args, **kwargs) + + def forward( + self, hidden_states: torch.Tensor, router_logits: torch.Tensor + ) -> torch.Tensor: + return self.moe(hidden_states, router_logits) + def weight_loader( self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_id: str ) -> None: diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 93884a6a05ff..587c56d80e1a 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -41,7 +41,10 @@ ) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -461,7 +464,7 @@ def forward( return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py index 
3dc4b0898b6d..72c56403f720 100644 --- a/vllm/model_executor/models/bailing_moe_linear.py +++ b/vllm/model_executor/models/bailing_moe_linear.py @@ -21,7 +21,10 @@ RMSNormGated, layernorm_fn, ) -from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -1004,7 +1007,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: """Get expert parameter mapping for MoE layers.""" - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index a72f4e487164..86a24f0e2fc9 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -71,7 +71,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return router_logits -class DbrxExperts(FusedMoE): +# XXXXXXXXXXXXXXXXXXXXXXXXXXXXX +class DbrxExperts(torch.nn.Module): def __init__( self, config: DbrxConfig, @@ -79,7 +80,7 @@ def __init__( params_dtype: torch.dtype | None = None, prefix: str = "", ): - super().__init__( + self.moe = FusedMoE( num_experts=config.ffn_config.moe_num_experts, top_k=config.ffn_config.moe_top_k, hidden_size=config.d_model, @@ -94,6 +95,9 @@ def __init__( self.d_model = config.d_model self.intermediate_size = self.config.ffn_config.ffn_hidden_size // self.tp_size + def forward(self, router_logits, hidden_states) -> torch.Tensor: + return self.moe(hidden_states, router_logits) + # Define custom weight loader for dbrx model def weight_loader( self, diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 5c439cdf486d..f975b32adc19 100644 --- 
a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -8,7 +8,9 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -105,7 +107,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index c75ee1a1bbfe..67d677567a43 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -11,7 +11,9 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -243,7 +245,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), ] - expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = 
fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index b50236c6b461..178787637891 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -51,6 +51,7 @@ GateLinear, RoutingMethodType, SharedFusedMoE, + fused_moe_make_expert_params_mapping, ) from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm from vllm.model_executor.layers.linear import ( @@ -1386,7 +1387,7 @@ def compute_logits( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", @@ -1420,7 +1421,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index f0bbde4bcdbe..a591a1a4bf72 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -40,7 +40,10 @@ ) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from 
vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -413,7 +416,7 @@ def forward( return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index c92e230bcd21..58be404898ff 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -42,7 +42,11 @@ from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + MoERunner, + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -485,7 +489,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", @@ -667,7 +671,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_moe_layers = len(moe_layers_indices) self.num_expert_groups = 1 - self.moe_layers: list[SharedFusedMoE] = [] + self.moe_layers: list[MoERunner] = [] example_moe = None for layer in self.model.layers: if isinstance(layer, PPMissingLayer): diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index fa70f1c7acf0..3f74976c51ac 100644 --- 
a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -36,7 +36,10 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -649,7 +652,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py index 40b12d66e4d2..45b035299e34 100644 --- a/vllm/model_executor/models/exaone_moe.py +++ b/vllm/model_executor/models/exaone_moe.py @@ -30,8 +30,10 @@ get_pp_group, get_tensor_model_parallel_world_size, ) -from vllm.model_executor.layers.fused_moe import FusedMoE -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe.shared_fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -327,7 +329,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return 
FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index cef934222382..f622d224e6d5 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -42,7 +42,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -465,7 +468,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/glm4_moe_lite.py b/vllm/model_executor/models/glm4_moe_lite.py index 6d96f748e3ea..77aaa179aa52 100644 --- a/vllm/model_executor/models/glm4_moe_lite.py +++ b/vllm/model_executor/models/glm4_moe_lite.py @@ -41,7 +41,9 @@ get_pp_group, ) from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -308,7 +310,7 @@ def make_empty_intermediate_tensors( def get_expert_mapping(self) 
-> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", @@ -334,7 +336,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", @@ -616,7 +618,7 @@ def compute_logits( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/glm4_moe_lite_mtp.py b/vllm/model_executor/models/glm4_moe_lite_mtp.py index efa96c40d042..4813af5f0307 100644 --- a/vllm/model_executor/models/glm4_moe_lite_mtp.py +++ b/vllm/model_executor/models/glm4_moe_lite_mtp.py @@ -32,7 +32,10 @@ from vllm._aiter_ops import rocm_aiter_ops from vllm.config import VllmConfig -from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + MoERunner, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -212,7 +215,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_moe_layers = 
self.config.num_nextn_predict_layers self.num_expert_groups = self.config.n_group - self.moe_layers: list[FusedMoE] = [] + self.moe_layers: list[MoERunner] = [] self.moe_mlp_layers: list[Glm4MoeLite] = [] example_moe = None for layer in self.model.layers.values(): @@ -260,7 +263,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), ] - expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index cde94673e53a..d87ad268285b 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -31,7 +31,10 @@ from transformers import PretrainedConfig from vllm.config import CacheConfig, ParallelConfig, VllmConfig -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + MoERunner, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -198,7 +201,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_moe_layers = self.config.num_nextn_predict_layers self.num_expert_groups = self.config.n_group - self.moe_layers: list[FusedMoE] = [] + self.moe_layers: list[MoERunner] = [] self.moe_mlp_layers: list[Glm4MoE] = [] example_moe = None for layer in self.model.layers.values(): @@ -247,7 +250,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( + 
expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 96f1b22f902c..83c0202a9859 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -20,7 +20,11 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + GateLinear, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -328,7 +332,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, weight scales, activation scales # (param_name, weight_name, expert_id, shard_id) # NOTE: this is only used for quark. 
- return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index f57a8c942bb4..e3585a6dd746 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -39,7 +39,10 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -351,7 +354,7 @@ def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str] # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index c9aa3d2068f0..f06122a7fd19 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -38,7 +38,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -519,7 +522,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Map expert parameter names to 
standard names num_experts = _get_num_experts(self.config) - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name=self.ckpt_gate_proj_name, ckpt_down_proj_name=self.ckpt_down_proj_name, diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index b5967fdebab6..949ec4638ad5 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -42,7 +42,10 @@ ) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -712,7 +715,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: if _is_moe(self.config): # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index b4b3b6873db3..84e96def6c1f 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -14,7 +14,10 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear 
import ( QKVParallelLinear, @@ -378,7 +381,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index e586a3ac3469..bb4586beb9d7 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -14,7 +14,10 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.kda import KimiDeltaAttention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -476,7 +479,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if self.config.is_moe: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 4b49430c1faf..55b00d2b9ea2 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -15,7 +15,10 @@ ) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe 
import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -482,7 +485,7 @@ def forward( return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a1c0ac896052..e08885de89e7 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -36,7 +36,10 @@ Attention, ChunkedLocalAttention, ) -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -554,7 +557,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: fused_experts_params = False # Expert parameter mapping for the case where the expert weights are # not fused into a single weight tensor. - expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", @@ -564,7 +567,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) # Expert parameter mapping for the case where the expert weights are # fused into a single weight tensor. 
- expert_params_mapping_fused = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping_fused = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_up_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index 945fcb61509b..d81df6f33737 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -46,7 +46,10 @@ from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -622,7 +625,7 @@ def compute_logits( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py index 0b466f16601a..0fe31c129e09 100644 --- a/vllm/model_executor/models/mimo_v2_flash.py +++ b/vllm/model_executor/models/mimo_v2_flash.py @@ -22,7 +22,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( 
MergedColumnParallelLinear, @@ -511,7 +514,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 1b068c8e4c6c..c8b1eade44e5 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -37,7 +37,10 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -384,7 +387,7 @@ def forward( return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index c182444f667d..cbfc254dda36 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -40,7 +40,10 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -364,7 +367,7 @@ def forward( def 
get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index c8cbb5890ab3..d309c63b221e 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -39,7 +39,9 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import MMEncoderAttention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -1072,7 +1074,7 @@ def _load_other_weights( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index b866f2bb5766..5702daecf49d 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -37,6 +37,7 @@ GateLinear, SharedFusedMoE, activation_without_mul, + fused_moe_make_expert_params_mapping, ) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( @@ -652,7 +653,7 @@ def _get_max_n_routed_experts(self) -> int: def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: if self.has_moe: # (param_name, 
weight_name, expert_id, shard_id) - expert_params_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( # - FusedMoe.w1 (aka gate_proj) should be up_proj since that's # what the activation is applied to # - FusedMoe.w3 (aka up_proj) should be ignored since we're diff --git a/vllm/model_executor/models/nemotron_h_mtp.py b/vllm/model_executor/models/nemotron_h_mtp.py index 12551d4254ed..fe737438c30f 100644 --- a/vllm/model_executor/models/nemotron_h_mtp.py +++ b/vllm/model_executor/models/nemotron_h_mtp.py @@ -11,7 +11,9 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.parallel import ParallelConfig -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -399,7 +401,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if getattr(self.config, "model_type", None) == "nemotron_h_puzzle": num_experts = self.config.mtp_n_routed_experts if num_experts is not None: - expert_params_mapping = FusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="up_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index fe41c584ac2e..9de793a5f75e 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -32,7 +32,10 @@ from vllm.distributed.utils import split_tensor_along_last_dim from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE 
+from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -336,7 +339,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 409370a77089..91f379847f04 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -44,7 +44,10 @@ Attention, StaticSinkAttention, ) -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -1149,7 +1152,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ] has_experts = hasattr(self.config, "n_routed_experts") if has_experts: - expert_merge_mapping = SharedFusedMoE.make_expert_params_mapping( + expert_merge_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/openpangu_mtp.py b/vllm/model_executor/models/openpangu_mtp.py index 91b454a4bc38..3a04ccdff5be 100644 --- a/vllm/model_executor/models/openpangu_mtp.py +++ b/vllm/model_executor/models/openpangu_mtp.py @@ -28,7 +28,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.model_executor.layers.fused_moe import FusedMoE 
+from vllm.model_executor.layers.fused_moe import ( + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -147,7 +149,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1), ] - expert_params_mapping = FusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 7d6083f202e6..5770420ce565 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -35,7 +35,10 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.linear import ( QKVParallelLinear, ReplicatedLinear, @@ -514,7 +517,7 @@ def forward( return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="w1", ckpt_down_proj_name="w2", diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index be28d3772d11..38b9ec3c4fd5 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -40,7 +40,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from 
vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -415,7 +418,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py index 0f74b913ad05..447b660e0e60 100644 --- a/vllm/model_executor/models/qwen3_5_mtp.py +++ b/vllm/model_executor/models/qwen3_5_mtp.py @@ -12,7 +12,9 @@ from vllm.config import VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -185,7 +187,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index b62c765201ce..154ca07af0c6 100644 --- 
a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -43,7 +43,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -508,7 +511,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 787a6e749f60..e994e437da56 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -38,7 +38,10 @@ fused_sigmoid_gating_delta_rule_update, ) from vllm.model_executor.layers.fla.ops.chunk import l2norm_fwd -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import ( GemmaRMSNorm as Qwen3NextRMSNorm, ) @@ -1419,7 +1422,7 @@ def forward( def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git 
a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index f3011f604427..56991bb1e2ae 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -11,7 +11,9 @@ from vllm.config import VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -145,7 +147,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # Params for weights, fp8 weight scales, fp8 activation scales # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( + expert_params_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/sarvam.py b/vllm/model_executor/models/sarvam.py index 8940dd611179..efec60f107f1 100644 --- a/vllm/model_executor/models/sarvam.py +++ b/vllm/model_executor/models/sarvam.py @@ -35,7 +35,11 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + MoERunner, + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -352,7 +356,7 @@ def __init__( routed_scaling_factor=self.routed_scaling_factor, ) - def maybe_get_fused_moe(self) -> SharedFusedMoE: + def maybe_get_fused_moe(self) -> MoERunner: return 
self.experts def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -542,7 +546,7 @@ def forward( return hidden_states def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return SharedFusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", diff --git a/vllm/model_executor/models/step3p5.py b/vllm/model_executor/models/step3p5.py index 018f78956029..886605fc9790 100644 --- a/vllm/model_executor/models/step3p5.py +++ b/vllm/model_executor/models/step3p5.py @@ -23,8 +23,11 @@ from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul, SwigluStepAndMul from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm.model_executor.layers.fused_moe import ( + MoERunner, + SharedFusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -635,7 +638,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ] # New per-expert format: .moe.experts.E.gate_proj.weight_packed [out, in] - per_expert_mapping = FusedMoE.make_expert_params_mapping( + per_expert_mapping = fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", @@ -891,7 +894,7 @@ def set_eplb_state( ) -> None: for layer_idx, layer in enumerate(self.moe_layers): experts = layer.experts - assert isinstance(experts, FusedMoE) + assert isinstance(experts, MoERunner) # Register the expert weights. 
self.expert_weights.append(experts.get_expert_weights()) experts.set_eplb_state( diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index baacb528af0d..405117f84b92 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -25,7 +25,10 @@ from vllm.distributed import get_dp_group, get_ep_group from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.models.interfaces import MixtureOfExperts from vllm.model_executor.models.utils import maybe_prefix from vllm.platforms import current_platform @@ -37,6 +40,7 @@ from vllm.config import VllmConfig +# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # --8<-- [start:transformers_fused_moe] @CustomOp.register("transformers_fused_moe") class TransformersFusedMoE(FusedMoE): @@ -179,7 +183,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: num_redundant_experts = self.parallel_config.eplb_config.num_redundant_experts for gate_proj, down_proj, up_proj in ckpt_names: expert_mapping.extend( - FusedMoE.make_expert_params_mapping( + fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name=gate_proj, ckpt_down_proj_name=down_proj, diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 74229699105b..d58b20de04a9 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -11,9 +11,12 @@ import vllm.envs as envs from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank +from vllm.model_executor.layers.fused_moe import MoERunner from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from 
vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M -from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod +from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( + FusedMoEModularMethod, +) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, ) @@ -104,7 +107,7 @@ def _extract_data_from_fused_moe_module( """ Extract weights, weight scales and num_topk from FusedMoE module. """ - assert isinstance(m_, FusedMoE) + assert isinstance(m_, MoERunner) m = m_.routed_experts w13 = m.w13_weight w13_s = ( @@ -157,7 +160,7 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: if not (envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM): return False - if not isinstance(module, FusedMoE): + if not isinstance(module, MoERunner): return False quant_method = module.routed_experts.quant_method diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 9b481d63990b..75cded8b7f0f 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -34,3 +34,16 @@ def length_from_prompt_token_ids_or_embeds( f" prompt_embeds={prompt_embeds_len}" ) return prompt_token_len + + +def is_moe_layer(module: torch.nn.Module) -> bool: + # TODO(bnell): Should use isinstance but can't. Maybe search for + # presence of quant_method.maybe_init_modular_kernel? 
+ # return (hasattr(module, "quant_method") + # and hasattr(module.quant_method, "moe_kernel")) + return ( + module.__class__.__name__ == "FusedMoE" + or module.__class__.__name__ == "SharedFusedMoE" + or module.__class__.__name__ == "DefaultMoERunner" + or module.__class__.__name__ == "ChunkingMoERunner" + ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0bc19c0bba2d..496c78ff9fda 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -6621,10 +6621,13 @@ def init_routed_experts_capturer(self): self.routed_experts_initialized = True def _bind_routed_experts_capturer(self, capturer: RoutedExpertsCapturer) -> None: - from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoERouter + from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoERouter, + MoERunner, + ) for module in self.compilation_config.static_forward_context.values(): - if isinstance(module, FusedMoE) and isinstance( + if isinstance(module, MoERunner) and isinstance( module.router, FusedMoERouter ): layer_id = module.layer_id From 57c7223e4698d6b1bc64326b9548bb03ac21d7b6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 10 Apr 2026 22:49:01 +0000 Subject: [PATCH 099/191] lora/transformers tweaks Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 7 +++---- vllm/lora/model_manager.py | 4 ++-- vllm/lora/utils.py | 5 +++-- vllm/model_executor/models/transformers/moe.py | 11 ++++++++--- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 7fa95e3a7d99..5d3cddf4a3a4 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -15,7 +15,7 @@ from vllm.distributed.utils import divide from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.lora.ops.triton_ops.utils import get_lora_op_configs -from vllm.model_executor.layers.fused_moe import FusedMoE, RoutedExperts +from 
vllm.model_executor.layers.fused_moe import FusedMoE, MoERunner, RoutedExperts from vllm.model_executor.layers.fused_moe.config import ( _get_config_dtype_str, ) @@ -43,12 +43,11 @@ # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX class FusedMoEWithLoRA(BaseLayerWithLoRA): - def __init__(self, base_layer: FusedMoE) -> None: + def __init__(self, base_layer: MoERunner) -> None: super().__init__() self.base_layer = base_layer - self._runner = base_layer._runner - assert not self.routed_experts.use_ep, ( + assert not self.base_layer.routed_experts.use_ep, ( "EP support for Fused MoE LoRA is not implemented yet." ) self.tp_size = get_tensor_model_parallel_world_size() diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index e5177055252a..84987b05053d 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -30,7 +30,7 @@ process_packed_modules_mapping, replace_submodule, ) -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import MoERunner from vllm.model_executor.models import ( SupportsLoRA, is_pooling_model, @@ -394,7 +394,7 @@ def _parent_module(module_name: str) -> str: parts = module_name.split(".")[-1] packed_moduled_lst = self.packed_modules_mapping.get(parts, []) # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - if isinstance(module, FusedMoE): + if isinstance(module, MoERunner): # packed_moduled_lst is used here to just determine whether to # instantiate FusedMoE3DWithLoRA or FusedMoEWithLoRA, and the # difference between these two LoRA layers is whether the diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 8501cb3607d7..a8f8af17dce2 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -35,7 +35,7 @@ RowParallelLinearWithShardedLoRA, VocabParallelEmbeddingWithLoRA, ) -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, MoERunner from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.utils import 
get_moe_expert_mapping, get_packed_modules_mapping @@ -98,7 +98,8 @@ def get_lora_id(): # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX def is_moe_model(model: nn.Module) -> bool: """Checks if the model contains FusedMoE layers and warns the user.""" - if any(isinstance(module, FusedMoE) for module in model.modules()): + # Use MoERunner or RoutedExperts? + if any(isinstance(module, MoERunner) for module in model.modules()): logger.info_once("MoE model detected. Using fused MoE LoRA implementation.") return True return False diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 405117f84b92..97cc2080642c 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -27,6 +27,7 @@ from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe import ( FusedMoE, + MoERunner, fused_moe_make_expert_params_mapping, ) from vllm.model_executor.models.interfaces import MixtureOfExperts @@ -40,10 +41,9 @@ from vllm.config import VllmConfig -# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # --8<-- [start:transformers_fused_moe] @CustomOp.register("transformers_fused_moe") -class TransformersFusedMoE(FusedMoE): +class TransformersFusedMoE(MoERunner): """Custom FusedMoE for the Transformers modeling backend.""" # --8<-- [end:transformers_fused_moe] @@ -67,7 +67,12 @@ def custom_routing_function(hidden_states, gating_output, topk, renormalize): return topk_weights, topk_ids kwargs["custom_routing_function"] = custom_routing_function - super().__init__(*args, **kwargs) + self.runner = FusedMoE(*args, **kwargs) + + def __getattr__(self, name): + # Delegate attribute access to the originalr runner. 
This is only + # called when normal lookup (instance __dict__, class MRO) fails, + return getattr(self.runner, name) def forward( self, From 3fbde20b44e929486eb36e380a9dac8a7ba0c8d5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 10 Apr 2026 23:16:26 +0000 Subject: [PATCH 100/191] lora tweaks. still not working Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 22 +++++++++++++--------- vllm/lora/utils.py | 4 ++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 5d3cddf4a3a4..53d7db4791e7 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -15,7 +15,7 @@ from vllm.distributed.utils import divide from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.lora.ops.triton_ops.utils import get_lora_op_configs -from vllm.model_executor.layers.fused_moe import FusedMoE, MoERunner, RoutedExperts +from vllm.model_executor.layers.fused_moe import MoERunner, RoutedExperts from vllm.model_executor.layers.fused_moe.config import ( _get_config_dtype_str, ) @@ -62,6 +62,10 @@ def __init__(self, base_layer: MoERunner) -> None: def routed_experts(self) -> RoutedExperts: return self.base_layer.routed_experts + @property + def _shared_experts(self): # return type + return self.base_layer.shared_experts # ? 
+ def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]: normalized_config = {} for key, value in config.items(): @@ -82,7 +86,7 @@ def _get_lora_moe_configs( rank: int, num_slices: int, M: int, - layer: FusedMoE, + layer: RoutedExperts, top_k: int, config_dtype: str, ): @@ -114,13 +118,13 @@ def _get_lora_moe_configs( else: # fall back to the default config get_config_func = functools.partial( try_get_optimal_moe_lora_config, - w1_shape=layer.routed_experts.w13_weight.size(), - w2_shape=layer.routed_experts.w2_weight.size(), + w1_shape=layer.w13_weight.size(), + w2_shape=layer.w2_weight.size(), rank=rank, top_k=top_k, dtype=config_dtype, M=M, - block_shape=layer.routed_experts.quant_method.moe_quant_config.block_shape, + block_shape=layer.quant_method.moe_quant_config.block_shape, ) shrink_config = get_config_func( op_type=f"fused_moe_lora_{op_prefix}_shrink" @@ -206,7 +210,7 @@ def wrapper(*args, **kwargs): rank=max_lora_rank, num_slices=self._w13_slices, M=M, - layer=layer, + layer=layer.routed_experts, top_k=top_k, config_dtype=config_dtype, ) @@ -296,7 +300,7 @@ def wrapper(*args, **kwargs): rank=max_lora_rank, num_slices=1, M=M, - layer=layer, + layer=layer.routed_experts, top_k=top_k, config_dtype=config_dtype, ) @@ -616,7 +620,7 @@ def can_replace_layer( """Returns True if the layer can be replaced by this LoRA layer.""" # source_layer is FusedMoE or SharedFusedMoE - return isinstance(source_layer, FusedMoE) and len(packed_modules_list) == 2 + return isinstance(source_layer, MoERunner) and len(packed_modules_list) == 2 class FusedMoE3DWithLoRA(FusedMoEWithLoRA): @@ -778,4 +782,4 @@ def can_replace_layer( ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" # source_layer is FusedMoE or SharedFusedMoE - return isinstance(source_layer, FusedMoE) and len(packed_modules_list) == 1 + return isinstance(source_layer, MoERunner) and len(packed_modules_list) == 1 diff --git a/vllm/lora/utils.py 
b/vllm/lora/utils.py index a8f8af17dce2..8c71bd664b45 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -35,7 +35,7 @@ RowParallelLinearWithShardedLoRA, VocabParallelEmbeddingWithLoRA, ) -from vllm.model_executor.layers.fused_moe import FusedMoE, MoERunner +from vllm.model_executor.layers.fused_moe import MoERunner from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.utils import get_moe_expert_mapping, get_packed_modules_mapping @@ -225,7 +225,7 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]: if isinstance(module, (LinearBase,)): supported_lora_modules.add(name.split(".")[-1]) - if isinstance(module, (FusedMoE,)): + if isinstance(module, (MoERunner,)): supported_lora_modules.add(name.split(".")[-1]) return list(supported_lora_modules) From 028db8fc4a5476418cae413c4be532a98930ba3c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 22 Apr 2026 18:34:07 +0000 Subject: [PATCH 101/191] collapse runner Signed-off-by: Bill Nell --- .../modular_kernel_tools/parallel_utils.py | 2 +- tests/kernels/moe/test_moe_layer.py | 8 - .../layers/fused_moe/__init__.py | 6 + .../layers/fused_moe/cutlass_moe.py | 1 - .../layers/fused_moe/fused_moe_method_base.py | 10 +- .../fused_moe/fused_moe_modular_method.py | 3 +- vllm/model_executor/layers/fused_moe/layer.py | 21 +- .../layers/fused_moe/modular_kernel.py | 28 +- .../layers/fused_moe/oracle/fp8.py | 8 - .../layers/fused_moe/oracle/mxfp4.py | 5 - .../layers/fused_moe/oracle/nvfp4.py | 5 - .../layers/fused_moe/oracle/unquantized.py | 4 - .../layers/fused_moe/routed_experts.py | 3 + .../fused_moe/runner/chunking_moe_runner.py | 225 ----- .../fused_moe/runner/default_moe_runner.py | 163 ---- .../layers/fused_moe/runner/moe_runner.py | 851 ++++++++++++++++-- .../fused_moe/runner/moe_runner_base.py | 828 ----------------- .../fused_moe/runner/moe_runner_factory.py | 65 -- .../fused_moe/runner/moe_runner_interface.py | 139 +++ .../layers/fused_moe/runner/shared_experts.py | 6 +- 
.../layers/fused_moe/shared_fused_moe.py | 20 +- .../fused_moe/unquantized_fused_moe_method.py | 12 +- .../layers/quantization/awq_marlin.py | 8 +- .../layers/quantization/bitsandbytes.py | 8 +- .../compressed_tensors_moe.py | 14 +- .../layers/quantization/experts_int8.py | 2 + .../model_executor/layers/quantization/fp8.py | 4 +- .../layers/quantization/gguf.py | 12 +- .../layers/quantization/gptq_marlin.py | 8 +- .../layers/quantization/modelopt.py | 20 +- .../layers/quantization/moe_wna16.py | 2 + .../layers/quantization/mxfp4.py | 8 +- .../layers/quantization/quark/quark_moe.py | 4 + vllm/utils/__init__.py | 16 +- 34 files changed, 1052 insertions(+), 1467 deletions(-) delete mode 100644 vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py delete mode 100644 vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py delete mode 100644 vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py delete mode 100644 vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py create mode 100644 vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index a998d386c6f7..dfb24d96649c 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -77,8 +77,8 @@ def _worker_parallel_launch( *args: Any, ) -> None: rank = node_rank * world_local_size + local_rank - torch.accelerator.set_device_index(local_rank) device = torch.device("cuda", local_rank) + torch.accelerator.set_device_index(device) torch.distributed.init_process_group( backend="cpu:gloo,cuda:nccl", init_method=init_method, diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index c10317b4e853..06c469c4e15f 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1125,10 +1125,6 @@ def 
_test_body_eplb( routed_output_transform=routed_output_transform, ) - # Necessary? - # if moe_layer._expert_map is not None: - # moe_layer._expert_map = moe_layer._expert_map.to(device) - # All ranks must generate the same permutation initial_indices = torch.arange(num_experts, dtype=torch.long) shuffled_indices = initial_indices[torch.randperm(num_experts)] @@ -1302,10 +1298,6 @@ def _run_one_config( routed_output_transform=routed_output_transform, ) - # Necessary? - # if moe_layer._expert_map is not None: - # moe_layer._expert_map = moe_layer._expert_map.to(device) - num_tokens = m num_tokens_across_dp = torch.tensor( [num_tokens] * world_size, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 0c45d22696d2..dec1c789a627 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -11,6 +11,7 @@ ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, + FusedMoEQuantConfig, RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( @@ -39,6 +40,9 @@ from vllm.model_executor.layers.fused_moe.runner.moe_runner_factory import ( create_moe_runner, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, @@ -65,6 +69,7 @@ def get_config() -> dict[str, Any] | None: "FusedMoE", "FusedMoEActivationFormat", "FusedMoEConfig", + "FusedMoEQuantConfig", "FusedMoEExpertsModular", "FusedMoEMethodBase", "FusedMoEPrepareAndFinalizeModular", @@ -75,6 +80,7 @@ def get_config() -> dict[str, Any] | None: "MoERunner", "RoutedExperts", "RoutingMethodType", + "SharedExperts", "SharedFusedMoE", "UnquantizedFusedMoEMethod", "activation_without_mul", diff --git 
a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 43082b3675a6..54e2128789ea 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1194,7 +1194,6 @@ def cutlass_moe_w4a8_fp8( quant_config=quant_config, group_size=group_size, ), - shared_experts=None, inplace=False, ) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 82ad9e812651..63f2cbf8b585 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -16,6 +16,9 @@ FusedMoEExpertsModular, FusedMoEPrepareAndFinalizeModular, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) from vllm.model_executor.layers.quantization.base_config import ( QuantizeMethodBase, ) @@ -42,10 +45,12 @@ def supports_internal_mk(self) -> bool: return self.moe_kernel is not None @property - def mk_owns_shared_expert(self) -> bool: + def mk_can_overlap_shared_experts(self) -> bool: # NOTE(rob): temporary attribute to indicate support for # completed migration to the new internal MK interface. 
- return self.moe_kernel is not None and self.moe_kernel.owns_shared_experts + return ( + self.moe_kernel is not None and self.moe_kernel.can_overlap_shared_experts + ) @abstractmethod def create_weights( @@ -133,6 +138,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: """ diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 321facc20969..7f7baea60dea 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -61,7 +61,6 @@ def make( FusedMoEKernel( prepare_finalize, old_quant_method.select_gemm_impl(prepare_finalize, routed_experts), - shared_experts=shared_experts, inplace=inplace, ), ) @@ -96,6 +95,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.moe_kernel is not None @@ -109,5 +109,6 @@ def apply( global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=None if self.disable_expert_map else layer.expert_map, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2d533f1e02cf..4a8cd5df9178 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -30,9 +30,6 @@ from vllm.model_executor.layers.fused_moe.runner.moe_runner import ( MoERunner, ) -from vllm.model_executor.layers.fused_moe.runner.moe_runner_factory import ( - create_moe_runner, -) from vllm.model_executor.layers.fused_moe.utils import ( disable_inplace, ) @@ -117,6 +114,7 @@ def 
determine_expert_counts( return global_num_experts, logical_num_experts, num_fused_shared_experts +# TODO: rename this def FusedMoE( num_experts: int, # Global number of experts top_k: int, @@ -152,7 +150,8 @@ def FusedMoE( apply_scale_to_output: bool = False, zero_expert_type: str | None = None, ) -> MoERunner: - """FusedMoE layer for MoE models. + # TODO update comment + """FusedMoE layer builder for MoE models. This layer contains both MergedColumnParallel weights (gate_up_proj / w13) and RowParallelLinear weights (down_proj/ w2). @@ -281,7 +280,7 @@ def FusedMoE( is_lora_enabled=vllm_config.lora_config is not None, activation=moe_activation, device=vllm_config.device_config.device, - routing_method=router.routing_method_type, + routing_method=router.routing_method_type, # Not ideal # TODO: in_dtype == out_dtype? disable_inplace=disable_inplace() or shared_experts is not None, ) @@ -312,13 +311,7 @@ def FusedMoE( activation=moe_activation, ) - # TODO(bnell): this needs to be stored as a parameter for weight loading. - # ditch this eventually. - - # Storing the runner in the FusedMoE is an intermediate state, eventually - # the runner will own the FusedMoE layer and provide the execution interface - # for MoE ops. - runner = create_moe_runner( + runner = MoERunner( layer_name=layer_name, moe_config=moe_config, router=router, @@ -332,10 +325,6 @@ def FusedMoE( routed_scaling_factor=routed_scaling_factor, ) - # HACK XXXXXXXXXXXXXXXXXXXXXXXX - # This is needed by various _setup_kernels in quant methods. 
- routed_experts.shared_experts = runner.shared_experts - # For smuggling this layer into the fused moe custom op register_layer_for_moe_forward_op(vllm_config, runner) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 7a5f96a5b07b..8284aee3f213 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -996,17 +996,10 @@ def __init__( self, prepare_finalize: FusedMoEPrepareAndFinalizeModular, fused_experts: FusedMoEExpertsModular, - shared_experts: SharedExperts | None = None, inplace: bool = False, ): self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts - # Only accept shared experts if they can be run w/async. - # The MoERunner/SharedExperts class will coordinate with the MK to ensure - # that the SharedExperts are executed only once. - self.shared_experts = ( - shared_experts if prepare_finalize.supports_async() else None - ) self.inplace = inplace moe_parallel_config = fused_experts.moe_config.moe_parallel_config self.moe_parallel_config = moe_parallel_config @@ -1081,11 +1074,13 @@ def _allocate_buffers( def _maybe_apply_shared_experts( self, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ): - if self.shared_experts is not None: + if shared_experts is not None and self.prepare_finalize.supports_async(): + assert not self.inplace assert shared_experts_input is not None - self.shared_experts( + shared_experts( shared_experts_input, SharedExpertsOrder.MK_INTERNAL_OVERLAPPED, ) @@ -1249,6 +1244,7 @@ def _finalize( topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ @@ -1256,6 +1252,7 @@ def _finalize( that handles DBO, async and shared expert overlap. 
Args: + shared_experts: Optional shared_experts_input: Optional separate input for shared experts. When latent MoE is used, hidden_states is the latent-projected tensor (smaller dimension) used by routed experts, while @@ -1282,7 +1279,7 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - self._maybe_apply_shared_experts(shared_experts_input) + self._maybe_apply_shared_experts(shared_experts, shared_experts_input) # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1318,6 +1315,7 @@ def apply( global_num_experts: int = -1, expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, + shared_experts: SharedExperts | None = None, shared_experts_input: torch.Tensor | None = None, ) -> torch.Tensor: """ @@ -1348,7 +1346,6 @@ def apply( - torch.Tensor: The output tensor after applying the MoE layer. """ if self.inplace: - assert self.shared_experts is None assert not disable_inplace() output = hidden_states else: @@ -1390,6 +1387,7 @@ def apply( topk_weights, topk_ids, apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -1462,7 +1460,6 @@ def __init__( self, prepare_finalize: FusedMoEPrepareAndFinalize, fused_experts: FusedMoEExperts, - shared_experts: SharedExperts | None = None, inplace: bool = False, ): super().__init__() @@ -1475,7 +1472,6 @@ def __init__( self.impl = FusedMoEKernelModularImpl( prepare_finalize, fused_experts, - shared_experts, inplace, ) @@ -1498,9 +1494,9 @@ def __init__( self._post_init_setup() @property - def owns_shared_experts(self) -> bool: + def can_overlap_shared_experts(self) -> bool: if isinstance(self.impl, FusedMoEKernelModularImpl): - return self.impl.shared_experts is not None + return self.impl.prepare_finalize.supports_async() else: return False @@ -1583,6 +1579,7 @@ def apply( global_num_experts: int, expert_map: 
torch.Tensor | None, apply_router_weight_on_input: bool, + shared_experts: SharedExperts | None = None, shared_experts_input: torch.Tensor | None = None, ) -> torch.Tensor: assert isinstance(self.impl, FusedMoEKernelModularImpl) @@ -1596,5 +1593,6 @@ def apply( global_num_experts=global_num_experts, expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index 3d9a499027cd..9e6e277c66e6 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -18,9 +18,6 @@ fp8_w8a8_moe_quant_config, fp8_w8a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( FlashinferMoeBackend, get_flashinfer_moe_backend, @@ -548,7 +545,6 @@ def make_fp8_moe_kernel( experts_cls: type[mk.FusedMoEExperts], fp8_backend: Fp8MoeBackend, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: SharedExperts | None = None, ) -> mk.FusedMoEKernel: # Create Prepare/Finalize. prepare_finalize = maybe_make_prepare_finalize( @@ -578,13 +574,9 @@ def make_fp8_moe_kernel( quant_config=moe_quant_config, ) - # NOTE(rob): we only want the mk to control the shared_expert - # if using all2all (for SBO). bnell is making this explicit in - # the new MoE runner class. 
kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts, inplace=( not moe_config.disable_inplace and fp8_backend != Fp8MoeBackend.FLASHINFER_CUTLASS diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py index 77a53a6c3e5f..d9566bb7a48e 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -854,11 +854,6 @@ def make_mxfp4_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=( - shared_experts - if moe_config.moe_parallel_config.use_deepep_ll_kernels - else None - ), inplace=( not moe_config.disable_inplace and mxfp4_backend not in TRTLLM_BACKENDS ), diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index d946c5eb53c5..960a12c77563 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ -17,9 +17,6 @@ nvfp4_moe_quant_config, nvfp4_w4a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( prepare_nvfp4_moe_layer_for_fi_or_cutlass, ) @@ -389,7 +386,6 @@ def make_nvfp4_moe_kernel( moe_config: FusedMoEConfig, experts_cls: type[mk.FusedMoEExperts], routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: SharedExperts | None = None, ) -> mk.FusedMoEKernel: # Create Prepare/Finalize. 
prepare_finalize = maybe_make_prepare_finalize( @@ -425,7 +421,6 @@ def make_nvfp4_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts, inplace=False, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py index 926819592c3a..9c31da10dd94 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py +++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py @@ -224,7 +224,6 @@ def make_unquantized_moe_kernel( moe_config=moe_config, quant_config=quant_config, ), - shared_experts=None, inplace=False, ) @@ -239,7 +238,6 @@ def make_unquantized_moe_kernel( moe_config=moe_config, quant_config=quant_config, ), - shared_experts=None, inplace=not moe_config.disable_inplace, ) elif backend == UnquantizedMoeBackend.TRITON: @@ -251,7 +249,6 @@ def make_unquantized_moe_kernel( moe_config=moe_config, quant_config=quant_config, ), - shared_experts=None, inplace=not moe_config.disable_inplace, ) elif backend == UnquantizedMoeBackend.XPU: @@ -263,7 +260,6 @@ def make_unquantized_moe_kernel( moe_config=moe_config, quant_config=quant_config, ), - shared_experts=None, inplace=not moe_config.disable_inplace, ) return kernel diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index b915272db2ed..d3f17dd6cee0 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -982,6 +982,7 @@ def forward( topk_ids: torch.Tensor | None = None, router_logits: torch.Tensor | None = None, shared_experts_input: torch.Tensor | None = None, + shared_experts: torch.nn.Module | None = None, ) -> torch.Tensor: """ Execute routed experts using the quantization method's apply function. 
@@ -1004,6 +1005,7 @@ def forward( quant_method = self.quant_method if quant_method.is_monolithic: + assert shared_experts is None # Monolithic kernels handle routing internally return quant_method.apply_monolithic( layer=self, # Pass RoutedExperts as layer @@ -1017,6 +1019,7 @@ def forward( x=x, topk_weights=topk_weights, topk_ids=topk_ids, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py deleted file mode 100644 index f66db463bd52..000000000000 --- a/vllm/model_executor/layers/fused_moe/runner/chunking_moe_runner.py +++ /dev/null @@ -1,225 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.forward_context import ( - get_forward_context, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( - FusedMoEMethodBase, -) -from vllm.model_executor.layers.fused_moe.runner.moe_runner_base import MoERunnerBase -from vllm.utils.math_utils import cdiv -from vllm.v1.worker.ubatching import dbo_current_ubatch_id -from vllm.v1.worker.workspace import current_workspace_manager - -logger = init_logger(__name__) - - -class ChunkingMoERunner(MoERunnerBase): - """ - MoE runner wrapper that adds chunked processing to any MoERunnerBase. - - This runner wraps an inner MoERunnerBase and overrides _forward_impl to - process large batches by breaking them into smaller chunks. Each chunk - is delegated to the inner runner's _forward_impl, making chunking - composable with any runner implementation. - - All MoERunnerBase state (moe_config, router, quant_method, etc.) is - transparently delegated to the inner runner via __getattr__. - ChunkingMoERunner only owns chunking-specific state: the pre-allocated - workspace buffers. 
- - Key behaviors: - - Pre-allocates workspace tensors for CUDA graph compatibility - - Processes chunks via inner._forward_impl per chunk - """ - - def __init__(self, inner: MoERunnerBase, **kwargs): - super().__init__(**kwargs) # this is not ideal - - # TODO(bnell): fix this - self._shared_experts = inner._shared_experts - - # Assert that _maybe_dispatch/_maybe_combine will be nops. - assert inner.moe_config.pcp_size == 1 - - # Skip MoERunnerBase.__init__ — all state is delegated to inner - # via __getattr__. Only chunking-specific state lives here. - self._inner = inner - - # Pre-allocated staging buffers. These need to exist ahead of time - # due to CUDA graph construction needing fixed buffer addresses. - self.batched_hidden_states, self.batched_router_logits = ( - self._init_dp_chunking() - ) - - # def __getattr__(self, name): - # # Delegate attribute access to the inner runner. This is only - # # called when normal lookup (instance __dict__, class MRO) fails, - # # so ChunkingMoERunner's own attributes and methods take priority. - # return getattr(self._inner, name) - - def _replace_quant_method(self, quant_method: FusedMoEMethodBase): - self.routed_experts.quant_method = quant_method - self._inner._replace_quant_method(quant_method) - assert self.shared_experts == self._inner.shared_experts - - def _init_dp_chunking(self) -> list[torch.Tensor]: - states_shape: tuple[int, ...] - logits_shape: tuple[int, ...] - - moe = self.moe_config - - if self.enable_dbo: - states_shape = (2, moe.max_num_tokens, self.moe_config.hidden_dim) - logits_shape = (2, moe.max_num_tokens, self.moe_config.num_logical_experts) - else: - states_shape = (moe.max_num_tokens, self.moe_config.hidden_dim) - logits_shape = (moe.max_num_tokens, self.moe_config.num_logical_experts) - - # Does this need some kind of profiling run check like modular_kernel.py? 
- return current_workspace_manager().get_simultaneous( - (states_shape, moe.in_dtype), - (logits_shape, moe.router_logits_dtype), - ) - - def _allocate_dp_chunking_outputs( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor | None, torch.Tensor]: - # Assert the inputs are of the proper type and shape. - assert self.batched_hidden_states is not None - assert self.batched_router_logits is not None - - assert self.batched_hidden_states.dtype == hidden_states.dtype, ( - f"{self.batched_hidden_states.dtype} == {hidden_states.dtype}" - ) - assert self.batched_router_logits.dtype == router_logits.dtype, ( - f"{self.batched_router_logits.dtype} == {router_logits.dtype}" - ) - - # Check size compatibility. - assert self.batched_hidden_states.size(-1) == hidden_states.size(-1) - assert self.batched_router_logits.size(-1) == router_logits.size(-1) - - final_fused_hidden_states = torch.empty_like(hidden_states) - if self.shared_experts is not None: - final_shared_hidden_states = torch.empty_like(hidden_states) - else: - final_shared_hidden_states = None - - return final_shared_hidden_states, final_fused_hidden_states - - def _slice_and_copy_input( - self, - out_slice: torch.Tensor, - orig: torch.Tensor | None, - start: int, - end: int, - ) -> torch.Tensor: - assert orig is not None - slice_size = end - start - orig_slice = orig[start:end, :] - if self.enable_dbo: - assert out_slice.dim() == 3 - batch_buffer_idx = dbo_current_ubatch_id() - out_slice = out_slice[batch_buffer_idx, :] - - assert out_slice.size(0) >= slice_size - out_slice = out_slice[:slice_size, :] - out_slice.copy_(orig_slice, non_blocking=True) - return out_slice - - def _forward_impl( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - final_shared_hidden_states, final_fused_hidden_states = ( - self._allocate_dp_chunking_outputs(hidden_states, 
router_logits) - ) - - ctx = get_forward_context() - # flashinfer_cutlass_kernels can handle: optional DP + TP/EP - max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu - moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens - - # If the input to the MoE is sequence parallel then divide by sp_size - # to find the maximum number of tokens for any individual dispatcher. - if self.moe_config.is_sequence_parallel: - max_tokens_across_dispatchers = cdiv( - max_tokens_across_dispatchers, self.moe_config.sp_size - ) - - num_tokens = hidden_states.size(0) - for chunk_idx, chunk_start_ in enumerate( - range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank) - ): - chunk_start = chunk_start_ - chunk_end = min( - chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dispatchers - ) - # clamp start and end - chunk_start = min(chunk_start, num_tokens - 1) - chunk_end = min(chunk_end, num_tokens) - chunk_sizes = ctx.dp_metadata.chunked_sizes( - self.moe_config.sp_size, moe_dp_chunk_size_per_rank, chunk_idx - ) - with chunk_sizes: - hidden_states_chunk = self._slice_and_copy_input( - self.batched_hidden_states, - hidden_states, - chunk_start, - chunk_end, - ) - - router_logits_chunk = self._slice_and_copy_input( - self.batched_router_logits, - router_logits, - chunk_start, - chunk_end, - ) - - shared_experts_input_chunk = ( - shared_experts_input[chunk_start:chunk_end, :] - if shared_experts_input is not None - else None - ) - - # Delegate per-chunk computation to the inner runner. 
- chunk_result = self._inner._forward_impl( - hidden_states=hidden_states_chunk, - router_logits=router_logits_chunk, - shared_experts_input=shared_experts_input_chunk, - ) - - # Store outputs - # TODO(bnell): document when chunk_start >= num_tokens - if chunk_start < num_tokens: - if self.shared_experts is not None: - assert isinstance(chunk_result, tuple) - shared_output_chunk, hidden_states_chunk = chunk_result - final_fused_hidden_states[chunk_start:chunk_end, :].copy_( - hidden_states_chunk, non_blocking=True - ) - assert shared_output_chunk is not None - assert final_shared_hidden_states is not None - final_shared_hidden_states[chunk_start:chunk_end, :].copy_( - shared_output_chunk, non_blocking=True - ) - else: - assert isinstance(chunk_result, torch.Tensor) - final_fused_hidden_states[chunk_start:chunk_end, :].copy_( - chunk_result, non_blocking=True - ) - - if self.shared_experts is None: - return final_fused_hidden_states - else: - assert final_shared_hidden_states is not None - return (final_shared_hidden_states, final_fused_hidden_states) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py deleted file mode 100644 index b8e94708c678..000000000000 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ /dev/null @@ -1,163 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.distributed import ( - get_ep_group, - get_pcp_group, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, -) -from vllm.model_executor.layers.fused_moe.routed_experts import ( - RoutedExperts, -) -from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( - FusedMoERouter, -) -from vllm.model_executor.layers.fused_moe.runner.moe_runner_base import MoERunnerBase - -logger = init_logger(__name__) - - 
-class DefaultMoERunner(MoERunnerBase): - """ - Standard MoE runner implementation for executing Mixture of Experts layers. - - This is the primary concrete implementation of MoE execution logic, providing - comprehensive support for standard MoE operations. It handles: - - Expert routing and token dispatching using various routing strategies - - Shared experts computation with optional parallel execution using CUDA streams - - Tensor model parallel and expert parallel operations - - Multiple quantization methods and optimized kernel selection - - Both monolithic and decomposed expert execution paths - - Integration with various parallel execution modes (TP, EP, DP) - - The runner orchestrates the complete MoE forward pass including routing tokens - to experts, executing expert computations in parallel, and combining results. - It supports advanced features like overlapped execution of shared experts, - optimized kernels for different parallel configurations, and seamless - integration with vLLM's distributed execution framework. - - This implementation is suitable for most standard MoE use cases. For specialized - scenarios like large batch chunking, alternative runners like ChunkingMoERunner - may be more appropriate. - - Eventually, this class may be split into more specialized implementations - for different configurations (e.g., with/without shared experts, gates, etc.). 
- """ - - def __init__( - self, - layer_name: str, - moe_config: FusedMoEConfig, - router: FusedMoERouter, - routed_input_transform: torch.nn.Module | None, - gate: torch.nn.Module | None, - shared_experts: torch.nn.Module | None, - routed_experts: RoutedExperts, - enable_dbo: bool, - routed_output_transform: torch.nn.Module | None = None, - apply_scale_to_output: bool = False, - routed_scaling_factor: float = 1.0, - ): - super().__init__( - layer_name, - moe_config, - router, - routed_input_transform, - gate, - shared_experts, - routed_experts, - enable_dbo, - routed_output_transform=routed_output_transform, - apply_scale_to_output=apply_scale_to_output, - routed_scaling_factor=routed_scaling_factor, - ) - - @property - def do_naive_dispatch_combine(self) -> bool: - return ( - self.moe_config.dp_size > 1 - and not self.routed_experts.quant_method.supports_internal_mk - ) - - def _maybe_dispatch( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - # For naive dispatch/combine Dp/Ep, dispatch the hidden states and - # router logits to all experts. - # NOTE: this will be removed once all kernels are migrated into the - # MoEKernel framework. - if self.do_naive_dispatch_combine: - hidden_states, router_logits = get_ep_group().dispatch_router_logits( - hidden_states, - router_logits, - self.moe_config.is_sequence_parallel, - ) - - # NOTE: Similar with DP, PCP also needs dispatch and combine. For - # simplicity, AgRsAll2All was added separately for PCP here. Maybe - # we should modify All2AllManager abstraction to better support PCP. 
- if self.moe_config.pcp_size > 1: - hidden_states = get_pcp_group().all_gather( - hidden_states, - dim=0, - ) - router_logits = get_pcp_group().all_gather( - router_logits, - dim=0, - ) - - return hidden_states, router_logits - - def _maybe_combine( - self, - shared_output: torch.Tensor | None, - hidden_states: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]: - if self.do_naive_dispatch_combine: - hidden_states = get_ep_group().combine( - hidden_states, self.moe_config.is_sequence_parallel - ) - - if self.moe_config.pcp_size > 1: - hidden_states = get_pcp_group().reduce_scatter( - hidden_states, - dim=0, - ) - - if self.shared_experts is not None: - assert shared_output is not None - return shared_output, hidden_states - else: - return hidden_states - - def _forward_impl( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # TODO(bnell): parts of the dispatch/combine steps will go away once - # #32567 lands and the remaining kernels are made MKs. 
The PCP - # code will probably remain - hidden_states, router_logits = self._maybe_dispatch( - hidden_states, - router_logits, - ) - - shared_output, hidden_states = self._apply_quant_method( - hidden_states=hidden_states, - router_logits=router_logits, - shared_experts_input=shared_experts_input, - ) - - return self._maybe_combine( - shared_output, - hidden_states, - ) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 0dc7dcdd6810..82c1119e721a 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -1,134 +1,865 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from abc import abstractmethod -from collections.abc import Iterable +from collections.abc import Callable, Iterable +from contextlib import nullcontext +from typing import TYPE_CHECKING import torch +import torch.nn.functional as F from vllm.config.parallel import ExpertPlacementStrategy +from vllm.distributed import ( + get_ep_group, + get_pcp_group, + tensor_model_parallel_all_reduce, +) +from vllm.forward_context import ( + ForwardContext, + get_forward_context, + is_forward_context_available, +) +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.activation import MoEActivation +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, +) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) +from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( + FusedMoEModularMethod, +) +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts +from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( + FusedMoERouter, +) +from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( + ZeroExpertRouter, +) +from 
vllm.model_executor.layers.fused_moe.runner.moe_runner_interface import ( + MoERunnerInterface, +) from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( SharedExperts, + SharedExpertsOrder, +) +from vllm.platforms import current_platform +from vllm.utils.torch_utils import ( + HAS_OPAQUE_TYPE, + ModuleName, + direct_register_custom_op, ) +logger = init_logger(__name__) + + +def get_layer_from_name(layer_name: str) -> MoERunnerInterface: + forward_context: ForwardContext = get_forward_context() + if layer_name == "from_forward_context": + all_moe_layers = forward_context.all_moe_layers + assert all_moe_layers is not None + moe_layer_index = forward_context.moe_layer_index + if moe_layer_index >= len(all_moe_layers): + raise AssertionError( + "We expected the number of MOE layers in `all_moe_layers` " + "to be equal to the number of " + "{vllm.moe_forward, vllm.moe_forward_shared} calls." + ) + layer_name = all_moe_layers[moe_layer_index] + forward_context.moe_layer_index += 1 + layer = forward_context.no_compile_layers[layer_name] + assert isinstance(layer, MoERunnerInterface) + return layer + + +# On torch >= 2.11, layer_name is a hoisted ModuleName opaque object; +# on older versions it remains a plain str. +if TYPE_CHECKING: + from typing import TypeAlias + + _layer_name_type: TypeAlias = str | ModuleName +else: + _layer_name_type = ModuleName if HAS_OPAQUE_TYPE else str + + +def _resolve_layer_name(layer_name: str | ModuleName) -> str: + return layer_name.value if isinstance(layer_name, ModuleName) else layer_name + + +# Note: _moe_forward and _moe_forward_shared should not contain any +# implementation details, They should merely pass along control to +# the runner's '_forward_impl' method. +# These functions should never be called directly since they do not +# include all the functionality of the MoE layer. 
+def _moe_forward( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: _layer_name_type, +) -> torch.Tensor: + layer = get_layer_from_name(_resolve_layer_name(layer_name)) + return layer._forward_impl( + hidden_states, + router_logits, + shared_experts_input, + ) + + +def _moe_forward_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: _layer_name_type, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + -class MoERunner(torch.nn.Module): +def _moe_forward_shared( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: _layer_name_type, +) -> tuple[torch.Tensor, torch.Tensor]: + layer = get_layer_from_name(_resolve_layer_name(layer_name)) + return layer._forward_impl( + hidden_states, + router_logits, + shared_experts_input, + ) + + +def _moe_forward_shared_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: _layer_name_type, +) -> tuple[torch.Tensor, torch.Tensor]: + # Output shapes: + # - fused_out: same as hidden_states (routed experts use transformed size) + # - shared_out: same as shared_experts_input if provided, else same as + # hidden_states + # (For latent MoE: shared experts use original hidden_size, not latent size) + fused_out = torch.empty_like(hidden_states) + if shared_experts_input is not None: + shared_out = torch.empty_like(shared_experts_input) + else: + shared_out = torch.empty_like(hidden_states) + return shared_out, fused_out + + +direct_register_custom_op( + op_name="moe_forward", + op_func=_moe_forward, + mutates_args=["hidden_states"], + fake_impl=_moe_forward_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) + + +direct_register_custom_op( + op_name="moe_forward_shared", + op_func=_moe_forward_shared, + fake_impl=_moe_forward_shared_fake, + 
tags=(torch.Tag.needs_fixed_stride_order,), +) + + +def _unpack( + result: torch.Tensor | tuple[torch.Tensor, torch.Tensor], +) -> tuple[torch.Tensor | None, torch.Tensor]: + if isinstance(result, tuple): + return result + else: + return (None, result) + + +class MoERunner(MoERunnerInterface): """ - Abstract base class for Mixture of Experts (MoE) runners. + Abstract base class providing common functionality for MoE runner implementations. + + This class serves as the foundation for concrete MoE runner implementations by + providing shared state management and common utilities. It handles: + - Common initialization and configuration management + - Shared expert output reduction logic for tensor parallel scenarios + - Base methods for tensor model parallel reductions + - Common properties and utility functions used across different runner types - This class defines the interface that all MoE runner implementations must follow. - MoE runners are responsible for executing the forward pass of MoE layers, handling - expert routing, and managing tensor parallel operations. + Concrete subclasses must implement the abstract methods to define their specific + execution strategies, such as standard execution, chunked processing, or other + specialized approaches. The base class provides the infrastructure while + allowing flexibility in the actual MoE computation implementation. 
+ + Key abstract methods that subclasses must implement: + - _forward_impl: The core MoE computation logic specific to each runner type """ - def __init__(self): + def __init__( + self, + layer_name: str, + moe_config: FusedMoEConfig, + router: FusedMoERouter, + routed_input_transform: torch.nn.Module | None, + gate: torch.nn.Module | None, + shared_experts: torch.nn.Module | None, + routed_experts: RoutedExperts, + enable_dbo: bool, + routed_output_transform: torch.nn.Module | None = None, + apply_scale_to_output: bool = False, + routed_scaling_factor: float = 1.0, + ): super().__init__() - # HACK - self._already_called_process_weights_after_loading = True + self.moe_config = moe_config + self.router = router + self.routed_input_transform = routed_input_transform + self.routed_output_transform = routed_output_transform + self.gate = gate + self._shared_experts: SharedExperts | None = None + if shared_experts is not None: + self._shared_experts = SharedExperts( + shared_experts, + moe_config=moe_config, + mk_can_overlap_shared_experts=routed_experts.quant_method.mk_can_overlap_shared_experts, + ) + self.routed_experts = routed_experts + self.enable_dbo = enable_dbo + self.enable_eplb = moe_config.moe_parallel_config.enable_eplb + self.apply_scale_to_output = ( + apply_scale_to_output and routed_scaling_factor != 1.0 + ) + self.routed_scaling_factor = routed_scaling_factor + + # Needed for string -> MoERunner layer lookup in custom ops. + self.layer_name = layer_name + + self._forward_entry = self._select_forward() + + @property + def is_internal_router(self) -> bool: + return self.gate is not None + + @property + def _quant_method(self) -> FusedMoEMethodBase: + return self.routed_experts.quant_method + + @property + def shared_experts(self) -> SharedExperts | None: + return self._shared_experts + + # TODO(bnell): Temporary hack. Get rid of this. 
+ def _replace_quant_method(self, quant_method: FusedMoEMethodBase): + self.routed_experts.quant_method = quant_method + if self.shared_experts is not None: + self.shared_experts._mk_can_overlap_shared_experts = ( + quant_method.mk_can_overlap_shared_experts + ) + + def _select_forward(self) -> Callable: + if current_platform.is_tpu() or current_platform.is_cpu(): + # TODO: Once the OOM issue for the TPU backend is resolved, we + # will switch to using the moe_forward custom op. + # Note: CPU doesn't require wrapped _forward_impl. + return _moe_forward if self.shared_experts is None else _moe_forward_shared + + return ( + torch.ops.vllm.moe_forward + if self.shared_experts is None + else torch.ops.vllm.moe_forward_shared + ) + + def apply_routed_input_transform( + self, hidden_states: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """Apply transform for routed experts (e.g., latent projection). + + This is called by FusedMoE.forward_native. The original hidden_states + is saved separately so shared experts get [S, hidden_size] while + routed experts get the transformed [S, moe_latent_size]. + + Returns (possibly transformed) hidden states and the input for shared + experts (or None if there are no shared experts). + """ + if self.routed_input_transform is not None: + result = self.routed_input_transform(hidden_states) + # ReplicatedLinear returns (output, extra_bias) tuple. + # We only need the output tensor; extra_bias is not used here. + if isinstance(result, tuple): + return result[0], hidden_states + return result, hidden_states + + return ( + hidden_states, + hidden_states if self.shared_experts is not None else None, + ) + + def apply_routed_output_transform( + self, + fused_output: torch.Tensor, + ) -> torch.Tensor: + """Apply transform to routed expert output (e.g., latent to full dim). 
+ + Used by latent MoE models (e.g., NemotronH) where routed experts + operate in a compressed latent space and need projection back to + the full hidden dimension before combining with shared expert output. + """ + if self.routed_output_transform is not None: + r = self.routed_output_transform(fused_output) + fused_output = r[0] if isinstance(r, tuple) else r + return fused_output + + def _maybe_apply_output_scale( + self, + shared_output: torch.Tensor | None, + fused_output: torch.Tensor, + ) -> tuple[torch.Tensor | None, torch.Tensor]: + """Apply routed_scaling_factor to the output with FP16 overflow + protection. + + When apply_scale_to_output is True, scales the fused expert output + by routed_scaling_factor. For FP16, avoids overflow by dividing + shared_output by the scale instead (the decoder layer compensates + with matching divisions). + """ + if self.apply_scale_to_output: + if fused_output.dtype != torch.float16: + fused_output *= self.routed_scaling_factor + elif shared_output is not None: + shared_output *= 1.0 / self.routed_scaling_factor + return shared_output, fused_output + + def _must_reduce_shared_expert_output(self) -> bool: + """ + The shared_experts are typically computed using the RowParallelLinear + layer. The result of this function is typically used as + the reduce_results argument to the module. + When just tensor-parallel is used, it is not required to reduce + the shared_experts results immediately. Instead we reduce at the + once at the end of the MoE op. (Refer to DeepSeekV2MoE module) + With EP and all2all kernels - this is no longer viable as all + GPU ranks in DP, produce the complete set of hidden_states. + Therefore it is required that we reduce the shared_experts output + early. 
+ """ + return ( + self.shared_experts is not None + and self.routed_experts.quant_method.moe_kernel is not None + and self.routed_experts.quant_method.moe_kernel.output_is_reduced() + ) + + def _maybe_reduce_shared_expert_output( + self, + shared_output: torch.Tensor | None, + ) -> torch.Tensor | None: + """All-reduce shared expert output when the combine kernel already + reduced fused output. + + This is the "early" all-reduce path. When the combine kernel produces + already-reduced fused output, shared output must be reduced separately + to match. See _must_reduce_shared_expert_output for details. + """ + if self._must_reduce_shared_expert_output(): + assert shared_output is not None + shared_output = tensor_model_parallel_all_reduce(shared_output) + return shared_output + + def _maybe_reduce_output( + self, + states: torch.Tensor, + trunc_size: int, + ) -> torch.Tensor: + """Truncate padded dimensions and all-reduce the combined output. + + This is the "late" all-reduce path. When neither fused nor shared + output was individually reduced, the combined sum is all-reduced + here. Skipped when sequence-parallel is active (SP handles its + own reduction) or when the early path already reduced both outputs. + """ + result = states[..., :trunc_size] + + if ( + not self.moe_config.is_sequence_parallel + and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) + and not self._must_reduce_shared_expert_output() + ): + result = tensor_model_parallel_all_reduce(result) + + return result + + def _encode_layer_name(self) -> str | ModuleName: + """Return the layer name string for custom op layer lookup. + + When torch.compile is active, returns "from_forward_context" so the + custom op resolves the layer via ForwardContext at runtime (avoiding + graph breaks). Falls back to the literal layer name for unit tests + or when ForwardContext is unavailable. 
+ """ + if HAS_OPAQUE_TYPE: + return ModuleName(self.layer_name) + # Can be unavailable or None in unittests + if ( + is_forward_context_available() + and get_forward_context().all_moe_layers is not None + ): + return "from_forward_context" + return self.layer_name + + def _maybe_pad_hidden_states( + self, + shared_experts_input: torch.Tensor | None, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, int]: + """Pad hidden_states to moe_config.hidden_dim and compute the + original dimension for later truncation. + + For latent MoE, the routed hidden_states may be smaller than + hidden_dim. Padding ensures uniform tensor sizes through the + fused MoE kernel. The returned trunc_size is used by + _maybe_reduce_output to strip the padding from the result. + """ + shared_experts_hidden_dim = ( + shared_experts_input.shape[-1] if shared_experts_input is not None else 0 + ) + transformed_hidden_dim = hidden_states.shape[-1] + if ( + not self.routed_experts.quant_method.skip_forward_padding + and self.moe_config.hidden_dim != transformed_hidden_dim + ): + hidden_states = F.pad( + hidden_states, + (0, self.moe_config.hidden_dim - transformed_hidden_dim), + mode="constant", + value=0.0, + ) + + if self.routed_output_transform is not None and shared_experts_hidden_dim > 0: + orig_hidden_dims = shared_experts_hidden_dim + else: + orig_hidden_dims = transformed_hidden_dim + + return hidden_states, orig_hidden_dims + + def _maybe_apply_shared_experts( + self, + shared_experts_input: torch.Tensor | None, + order: SharedExpertsOrder, + ): + """Trigger shared expert computation at the specified ordering point. + + Shared experts can run at different points relative to routed experts + (EXTERNAL, BEFORE_QUANT_METHOD, AFTER_QUANT_METHOD) depending on the + model's overlap strategy. Only fires if shared experts are configured + and the order matches the shared experts' configured execution point. 
+ """ + if self.shared_experts is not None: + assert shared_experts_input is not None + self.shared_experts(shared_experts_input, order) + + def _apply_quant_method( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + ) -> tuple[torch.Tensor | None, torch.Tensor]: + """Run expert routing and the fused MoE kernel via the quant method. + + Orchestrates shared expert execution (before/after), expert selection + via the router, and the actual fused MoE computation. Returns + (shared_expert_output, fused_expert_output). + """ + # Run this before quant_method to avoid inplace issues. + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.NO_OVERLAP, + ) + + if self.routed_experts.quant_method.is_monolithic: + # Monolithic kernels: pass router_logits to routed_experts + fused_out = self.routed_experts( + x=hidden_states, + router_logits=router_logits, + ) + else: + # Modular kernels: select experts first, then call routed_experts + topk_weights, topk_ids = self.router.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + topk_indices_dtype=self.routed_experts.quant_method.topk_indices_dtype, + ) + + fused_out = self.routed_experts( + x=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts_input=shared_experts_input, + ) + + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.MULTI_STREAM_OVERLAPPED, + ) + + return ( + self.shared_experts.output if self.shared_experts is not None else None, + fused_out, + ) + + def _sequence_parallel_context(self): + """Return a context manager for sequence-parallel token + redistribution. + + When sequence parallelism is active, returns a context that handles + local size tracking for proper token scatter/gather. Otherwise + returns a no-op context. 
+ """ + ctx = get_forward_context() + return ( + ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size) + if ctx.dp_metadata + else nullcontext() + ) + + def _maybe_overlap_gate_with_shared_experts( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + ) -> torch.Tensor: + """Apply the gate module to compute router logits if provided. + + Used in overlapped mode where shared experts run in parallel with + routed experts on a separate CUDA stream. The gate is separated + from the router to allow this parallel execution. + """ + # If router/gate provided, then apply it here. + # (Note: This code runs only when "overlapped mode" is on to allow + # parallel execution of shared experts with the FusedMoE via + # separate cuda stream) + if self.shared_experts is not None: + self.shared_experts.maybe_setup_shared_experts_stream(shared_experts_input) + + if self.gate is not None: + router_logits, _ = self.gate(hidden_states) + + return router_logits + + def _maybe_add_zero_expert_output( + self, + result: torch.Tensor, + ) -> torch.Tensor: + """Add the zero expert's contribution to the final result. + + When a ZeroExpertRouter is used, it computes a bias-like output + from the "zero expert" that is added to the combined routed+shared + expert output. + """ + if isinstance(self.router, ZeroExpertRouter): + zero_expert_output = self.router.zero_expert_output + assert zero_expert_output is not None + result = result + zero_expert_output + return result - @abstractmethod def forward( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: - raise NotImplementedError + """Invoke the fused moe layer. - @property - @abstractmethod - def shared_experts(self) -> SharedExperts | None: - raise NotImplementedError + Input: + - hidden_states + - router_logits + + Output: + - The new hidden_states. 
+ + Calling sequence + - forward + - self._forward_entry (_moe_forward or _moe_forward_shared custom op) + - _forward_impl + + Note: The existence of _moe_forward and _moe_forward_shared custom ops are due + to the following reason: + 1. pytorch cannot handle union types in custom op signatures so + _moe_forward and _moe_forward_shared must be split. + + """ + + # Apply transform for routed experts (e.g., latent projection + # for latent MoE) + hidden_states, shared_experts_input = self.apply_routed_input_transform( + hidden_states, + ) + + hidden_states, og_hidden_dim = self._maybe_pad_hidden_states( + shared_experts_input, + hidden_states, + ) + + router_logits = self._maybe_overlap_gate_with_shared_experts( + hidden_states, + router_logits, + shared_experts_input, + ) + + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.EXTERNAL, + ) + + result = self._forward_entry( + hidden_states, + router_logits, + shared_experts_input, + self._encode_layer_name(), + ) + + # + # Note: there are two all-reduce points below. They are mutually + # exclusive, controlled by _must_reduce_shared_expert_output(): + # - When True: the combine kernel already reduced fused_output, + # so we reduce shared_output here to match, then skip the + # all-reduce in _maybe_reduce_output. + # - When False: neither output is reduced yet, so we combine + # them first and all-reduce the sum in _maybe_reduce_output. + + # Extract outputs from result + shared_output, fused_output = _unpack(result) + + # Apply output transform (e.g. latent -> full dim) + fused_output = self.apply_routed_output_transform(fused_output) + + # If combine kernel already reduced fused, reduce shared to match. + # See note above re: the two all-reduce points. 
+ shared_output = self._maybe_reduce_shared_expert_output(shared_output) + + shared_output, fused_output = self._maybe_apply_output_scale( + shared_output, fused_output + ) + + if shared_output is not None: + result = shared_output + fused_output + else: + result = fused_output + + result = self._maybe_reduce_output(result, og_hidden_dim) + + return self._maybe_add_zero_expert_output(result) @property - @abstractmethod - def is_internal_router(self) -> bool: - raise NotImplementedError + def do_naive_dispatch_combine(self) -> bool: + return ( + self.moe_config.dp_size > 1 + and not self.routed_experts.quant_method.supports_internal_mk + ) - # Temporary hack - @abstractmethod - def _replace_quant_method(self, quant_method: FusedMoEMethodBase): - raise NotImplementedError + def _maybe_dispatch( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + # For naive dispatch/combine Dp/Ep, dispatch the hidden states and + # router logits to all experts. + # NOTE: this will be removed once all kernels are migrated into the + # MoEKernel framework. + if self.do_naive_dispatch_combine: + hidden_states, router_logits = get_ep_group().dispatch_router_logits( + hidden_states, + router_logits, + self.moe_config.is_sequence_parallel, + ) + + # NOTE: Similar with DP, PCP also needs dispatch and combine. For + # simplicity, AgRsAll2All was added separately for PCP here. Maybe + # we should modify All2AllManager abstraction to better support PCP. 
+ if self.moe_config.pcp_size > 1: + hidden_states = get_pcp_group().all_gather( + hidden_states, + dim=0, + ) + router_logits = get_pcp_group().all_gather( + router_logits, + dim=0, + ) + + return hidden_states, router_logits + + def _maybe_combine( + self, + shared_output: torch.Tensor | None, + hidden_states: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]: + if self.do_naive_dispatch_combine: + hidden_states = get_ep_group().combine( + hidden_states, self.moe_config.is_sequence_parallel + ) + + if self.moe_config.pcp_size > 1: + hidden_states = get_pcp_group().reduce_scatter( + hidden_states, + dim=0, + ) + + if self.shared_experts is not None: + assert shared_output is not None + return shared_output, hidden_states + else: + return hidden_states - ######################################################################## + def _forward_impl( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """Entry point called by the custom op to run the MoE computation. + + Handles pre-dispatch setup (gate application, external shared expert + triggering, quant config init) then delegates to _forward_impl within + the sequence-parallel context. + """ + # TODO(bnell): this can be removed after MK migration is complete. + self.routed_experts._ensure_moe_quant_config_init() + + with self._sequence_parallel_context(): + # TODO(bnell): parts of the dispatch/combine steps will go away once + # #32567 lands and the remaining kernels are made MKs. 
The PCP + # code will probably remain + hidden_states, router_logits = self._maybe_dispatch( + hidden_states, + router_logits, + ) + + shared_output, hidden_states = self._apply_quant_method( + hidden_states=hidden_states, + router_logits=router_logits, + shared_experts_input=shared_experts_input, + ) + + return self._maybe_combine( + shared_output, + hidden_states, + ) + + # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX # - # FusedMoE layer methods + # Old methods from FusedMoE layer # - ######################################################################## + # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - @abstractmethod + # Note: maybe_init_modular_kernel should only be called by + # prepare_communication_buffer_for_model. + # This is called after all weight loading and post-processing, so it + # should be safe to swap out the quant_method. def maybe_init_modular_kernel(self) -> None: - raise NotImplementedError + # NOTE(rob): WIP refactor. For quant methods that own the MK + # we create the MK during process_weights_after_loading. + if ( + self.routed_experts.quant_method.supports_internal_mk + or self.routed_experts.quant_method.is_monolithic + ): + return None + + self.routed_experts._ensure_moe_quant_config_init() + # routing_tables only needed for round-robin expert placement with + # DeepEP all2all backend. 
+ routing_tables = self._maybe_init_expert_routing_tables() + + if isinstance(self.routed_experts.quant_method, FusedMoEModularMethod): + base_quant_method = self.routed_experts.quant_method.old_quant_method + else: + base_quant_method = self.routed_experts.quant_method + + prepare_finalize = base_quant_method.maybe_make_prepare_finalize( + routing_tables=routing_tables + ) + if prepare_finalize is not None: + logger.debug( + "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) + ) + self._replace_quant_method( + FusedMoEModularMethod.make( + self, + base_quant_method, + prepare_finalize, + self.shared_experts, + inplace=not base_quant_method.moe.disable_inplace, + ) + ) + + # + # Properties + # @property - @abstractmethod def layer_id(self): - raise NotImplementedError + # Delayed import to avoid circular dependency + from vllm.model_executor.models.utils import extract_layer_index + + return extract_layer_index(self.layer_name) # # Attributes still needed by models # @property - @abstractmethod def is_monolithic(self) -> bool: - raise NotImplementedError + return self.routed_experts.quant_method.is_monolithic @property - @abstractmethod def activation(self) -> MoEActivation: - raise NotImplementedError + return self.routed_experts.activation # # Expert maps # @property - @abstractmethod def expert_placement_strategy(self) -> ExpertPlacementStrategy: - raise NotImplementedError + return self.expert_map_manager.placement_strategy @property - @abstractmethod def expert_global_to_physical(self) -> torch.Tensor | None: - raise NotImplementedError + tables = self.expert_map_manager.routing_tables + return tables[0] if tables else None @property - @abstractmethod def expert_physical_to_global(self) -> torch.Tensor | None: - raise NotImplementedError + """Routing table: physical expert ID to global expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[1] if tables else None @property - @abstractmethod def 
expert_local_to_global(self) -> torch.Tensor | None: - raise NotImplementedError + """Routing table: local expert ID to global expert ID.""" + tables = self.expert_map_manager.routing_tables + return tables[2] if tables else None @property - @abstractmethod def expert_map(self) -> torch.Tensor | None: - raise NotImplementedError + return self.routed_experts.expert_map - @abstractmethod def _maybe_init_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - raise NotImplementedError + return self.routed_experts._maybe_init_expert_routing_tables() - @abstractmethod def update_expert_map(self): - raise NotImplementedError + self.routed_experts.update_expert_map() - @abstractmethod def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - raise NotImplementedError + """Map global expert ID to local expert ID.""" + return self.routed_experts._map_global_expert_id_to_local_expert_id(expert_id) + + # + # EPLB + # - @abstractmethod def get_expert_weights(self) -> Iterable[torch.Tensor]: - raise NotImplementedError + """Delegate to EPLB manager.""" + if self.router.eplb_manager is not None: + return self.router.eplb_manager.get_expert_weights(self.routed_experts) + else: + return [] - @abstractmethod def set_eplb_state( self, moe_layer_idx: int, @@ -136,4 +867,16 @@ def set_eplb_state( logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, ) -> None: - raise NotImplementedError + """ + Register the EPLB state in this layer. + + This is used later in forward pass, where we get the expert mapping + and record the load metrics in `expert_load_view`. 
+ """ + if self.router.eplb_manager is not None: + self.router.eplb_manager.set_state( + moe_layer_idx, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py deleted file mode 100644 index 914379c3bf8c..000000000000 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_base.py +++ /dev/null @@ -1,828 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from abc import abstractmethod -from collections.abc import Callable, Iterable -from contextlib import nullcontext -from typing import TYPE_CHECKING - -import torch -import torch.nn.functional as F - -from vllm.config.parallel import ExpertPlacementStrategy -from vllm.distributed import ( - tensor_model_parallel_all_reduce, -) -from vllm.forward_context import ( - ForwardContext, - get_forward_context, - is_forward_context_available, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.activation import MoEActivation -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, -) -from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( - FusedMoEMethodBase, -) -from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( - FusedMoEModularMethod, -) -from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts -from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( - FusedMoERouter, -) -from vllm.model_executor.layers.fused_moe.router.zero_expert_router import ( - ZeroExpertRouter, -) -from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, - SharedExpertsOrder, -) -from vllm.platforms import current_platform -from vllm.utils.torch_utils import ( - HAS_OPAQUE_TYPE, 
- ModuleName, - direct_register_custom_op, -) - -logger = init_logger(__name__) - - -def get_layer_from_name(layer_name: str) -> MoERunner: - forward_context: ForwardContext = get_forward_context() - if layer_name == "from_forward_context": - all_moe_layers = forward_context.all_moe_layers - assert all_moe_layers is not None - moe_layer_index = forward_context.moe_layer_index - if moe_layer_index >= len(all_moe_layers): - raise AssertionError( - "We expected the number of MOE layers in `all_moe_layers` " - "to be equal to the number of " - "{vllm.moe_forward, vllm.moe_forward_shared} calls." - ) - layer_name = all_moe_layers[moe_layer_index] - forward_context.moe_layer_index += 1 - layer = forward_context.no_compile_layers[layer_name] - assert isinstance(layer, MoERunner) - return layer - - -# On torch >= 2.11, layer_name is a hoisted ModuleName opaque object; -# on older versions it remains a plain str. -if TYPE_CHECKING: - from typing import TypeAlias - - _layer_name_type: TypeAlias = str | ModuleName -else: - _layer_name_type = ModuleName if HAS_OPAQUE_TYPE else str - - -def _resolve_layer_name(layer_name: str | ModuleName) -> str: - return layer_name.value if isinstance(layer_name, ModuleName) else layer_name - - -# Note: _moe_forward and _moe_forward_shared should not contain any -# implementation details, They should merely pass along control to -# the runner's '_forward_dispatch' method. -# These functions should never be called directly since they do not -# include all the functionality of the MoE layer. 
-def _moe_forward( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: _layer_name_type, -) -> torch.Tensor: - layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer._forward_dispatch( - hidden_states, - router_logits, - shared_experts_input, - ) - - -def _moe_forward_fake( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: _layer_name_type, -) -> torch.Tensor: - return torch.empty_like(hidden_states) - - -def _moe_forward_shared( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: _layer_name_type, -) -> tuple[torch.Tensor, torch.Tensor]: - layer = get_layer_from_name(_resolve_layer_name(layer_name)) - return layer._forward_dispatch( - hidden_states, - router_logits, - shared_experts_input, - ) - - -def _moe_forward_shared_fake( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - layer_name: _layer_name_type, -) -> tuple[torch.Tensor, torch.Tensor]: - # Output shapes: - # - fused_out: same as hidden_states (routed experts use transformed size) - # - shared_out: same as shared_experts_input if provided, else same as - # hidden_states - # (For latent MoE: shared experts use original hidden_size, not latent size) - fused_out = torch.empty_like(hidden_states) - if shared_experts_input is not None: - shared_out = torch.empty_like(shared_experts_input) - else: - shared_out = torch.empty_like(hidden_states) - return shared_out, fused_out - - -direct_register_custom_op( - op_name="moe_forward", - op_func=_moe_forward, - mutates_args=["hidden_states"], - fake_impl=_moe_forward_fake, - tags=(torch.Tag.needs_fixed_stride_order,), -) - - -direct_register_custom_op( - op_name="moe_forward_shared", - op_func=_moe_forward_shared, - fake_impl=_moe_forward_shared_fake, - 
tags=(torch.Tag.needs_fixed_stride_order,), -) - - -def _unpack( - result: torch.Tensor | tuple[torch.Tensor, torch.Tensor], -) -> tuple[torch.Tensor | None, torch.Tensor]: - if isinstance(result, tuple): - return result - else: - return (None, result) - - -class MoERunnerBase(MoERunner): - """ - Abstract base class providing common functionality for MoE runner implementations. - - This class serves as the foundation for concrete MoE runner implementations by - providing shared state management and common utilities. It handles: - - Common initialization and configuration management - - Shared expert output reduction logic for tensor parallel scenarios - - Base methods for tensor model parallel reductions - - Common properties and utility functions used across different runner types - - Concrete subclasses must implement the abstract methods to define their specific - execution strategies, such as standard execution, chunked processing, or other - specialized approaches. The base class provides the infrastructure while - allowing flexibility in the actual MoE computation implementation. 
- - Key abstract methods that subclasses must implement: - - _forward_impl: The core MoE computation logic specific to each runner type - """ - - def __init__( - self, - layer_name: str, - moe_config: FusedMoEConfig, - router: FusedMoERouter, - routed_input_transform: torch.nn.Module | None, - gate: torch.nn.Module | None, - shared_experts: torch.nn.Module | None, - routed_experts: RoutedExperts, - enable_dbo: bool, - routed_output_transform: torch.nn.Module | None = None, - apply_scale_to_output: bool = False, - routed_scaling_factor: float = 1.0, - ): - super().__init__() - self.moe_config = moe_config - self.router = router - self.routed_input_transform = routed_input_transform - self.routed_output_transform = routed_output_transform - self.gate = gate - self._shared_experts: SharedExperts | None = None - if shared_experts is not None: - self._shared_experts = SharedExperts( - shared_experts, - moe_config=moe_config, - mk_owns_shared_expert=routed_experts.quant_method.mk_owns_shared_expert, - ) - self.routed_experts = routed_experts - self.enable_dbo = enable_dbo - self.enable_eplb = moe_config.moe_parallel_config.enable_eplb - self.apply_scale_to_output = ( - apply_scale_to_output and routed_scaling_factor != 1.0 - ) - self.routed_scaling_factor = routed_scaling_factor - - # Needed for string -> MoERunner layer lookup in custom ops. - self.layer_name = layer_name - - self._forward_entry = self._select_forward() - - @property - def is_internal_router(self) -> bool: - return self.gate is not None - - @property - def _quant_method(self) -> FusedMoEMethodBase: - return self.routed_experts.quant_method - - @property - def shared_experts(self) -> SharedExperts | None: - return self._shared_experts - - # TODO(bnell): Temporary hack. Get rid of this. 
- def _replace_quant_method(self, quant_method: FusedMoEMethodBase): - self.routed_experts.quant_method = quant_method - if self.shared_experts is not None: - self.shared_experts._mk_owns_shared_expert = ( - quant_method.mk_owns_shared_expert - ) - - def _select_forward(self) -> Callable: - if current_platform.is_tpu() or current_platform.is_cpu(): - # TODO: Once the OOM issue for the TPU backend is resolved, we - # will switch to using the moe_forward custom op. - # Note: CPU doesn't require wrapped _forward_impl. - return _moe_forward if self.shared_experts is None else _moe_forward_shared - - return ( - torch.ops.vllm.moe_forward - if self.shared_experts is None - else torch.ops.vllm.moe_forward_shared - ) - - def apply_routed_input_transform( - self, hidden_states: torch.Tensor - ) -> tuple[torch.Tensor, torch.Tensor | None]: - """Apply transform for routed experts (e.g., latent projection). - - This is called by FusedMoE.forward_native. The original hidden_states - is saved separately so shared experts get [S, hidden_size] while - routed experts get the transformed [S, moe_latent_size]. - - Returns (possibly transformed) hidden states and the input for shared - experts (or None if there are no shared experts). - """ - if self.routed_input_transform is not None: - result = self.routed_input_transform(hidden_states) - # ReplicatedLinear returns (output, extra_bias) tuple. - # We only need the output tensor; extra_bias is not used here. - if isinstance(result, tuple): - return result[0], hidden_states - return result, hidden_states - - return ( - hidden_states, - hidden_states if self.shared_experts is not None else None, - ) - - def apply_routed_output_transform( - self, - fused_output: torch.Tensor, - ) -> torch.Tensor: - """Apply transform to routed expert output (e.g., latent to full dim). 
- - Used by latent MoE models (e.g., NemotronH) where routed experts - operate in a compressed latent space and need projection back to - the full hidden dimension before combining with shared expert output. - """ - if self.routed_output_transform is not None: - r = self.routed_output_transform(fused_output) - fused_output = r[0] if isinstance(r, tuple) else r - return fused_output - - def _maybe_apply_output_scale( - self, - shared_output: torch.Tensor | None, - fused_output: torch.Tensor, - ) -> tuple[torch.Tensor | None, torch.Tensor]: - """Apply routed_scaling_factor to the output with FP16 overflow - protection. - - When apply_scale_to_output is True, scales the fused expert output - by routed_scaling_factor. For FP16, avoids overflow by dividing - shared_output by the scale instead (the decoder layer compensates - with matching divisions). - """ - if self.apply_scale_to_output: - if fused_output.dtype != torch.float16: - fused_output *= self.routed_scaling_factor - elif shared_output is not None: - shared_output *= 1.0 / self.routed_scaling_factor - return shared_output, fused_output - - def _must_reduce_shared_expert_output(self) -> bool: - """ - The shared_experts are typically computed using the RowParallelLinear - layer. The result of this function is typically used as - the reduce_results argument to the module. - When just tensor-parallel is used, it is not required to reduce - the shared_experts results immediately. Instead we reduce at the - once at the end of the MoE op. (Refer to DeepSeekV2MoE module) - With EP and all2all kernels - this is no longer viable as all - GPU ranks in DP, produce the complete set of hidden_states. - Therefore it is required that we reduce the shared_experts output - early. 
- """ - return ( - self.shared_experts is not None - and self.routed_experts.quant_method.moe_kernel is not None - and self.routed_experts.quant_method.moe_kernel.output_is_reduced() - ) - - def _maybe_reduce_shared_expert_output( - self, - shared_output: torch.Tensor | None, - ) -> torch.Tensor | None: - """All-reduce shared expert output when the combine kernel already - reduced fused output. - - This is the "early" all-reduce path. When the combine kernel produces - already-reduced fused output, shared output must be reduced separately - to match. See _must_reduce_shared_expert_output for details. - """ - if self._must_reduce_shared_expert_output(): - assert shared_output is not None - shared_output = tensor_model_parallel_all_reduce(shared_output) - return shared_output - - def _maybe_reduce_output( - self, - states: torch.Tensor, - trunc_size: int, - ) -> torch.Tensor: - """Truncate padded dimensions and all-reduce the combined output. - - This is the "late" all-reduce path. When neither fused nor shared - output was individually reduced, the combined sum is all-reduced - here. Skipped when sequence-parallel is active (SP handles its - own reduction) or when the early path already reduced both outputs. - """ - result = states[..., :trunc_size] - - if ( - not self.moe_config.is_sequence_parallel - and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) - and not self._must_reduce_shared_expert_output() - ): - result = tensor_model_parallel_all_reduce(result) - - return result - - def _encode_layer_name(self) -> str | ModuleName: - """Return the layer name string for custom op layer lookup. - - When torch.compile is active, returns "from_forward_context" so the - custom op resolves the layer via ForwardContext at runtime (avoiding - graph breaks). Falls back to the literal layer name for unit tests - or when ForwardContext is unavailable. 
- """ - if HAS_OPAQUE_TYPE: - return ModuleName(self.layer_name) - # Can be unavailable or None in unittests - if ( - is_forward_context_available() - and get_forward_context().all_moe_layers is not None - ): - return "from_forward_context" - return self.layer_name - - def _maybe_pad_hidden_states( - self, - shared_experts_input: torch.Tensor | None, - hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, int]: - """Pad hidden_states to moe_config.hidden_dim and compute the - original dimension for later truncation. - - For latent MoE, the routed hidden_states may be smaller than - hidden_dim. Padding ensures uniform tensor sizes through the - fused MoE kernel. The returned trunc_size is used by - _maybe_reduce_output to strip the padding from the result. - """ - shared_experts_hidden_dim = ( - shared_experts_input.shape[-1] if shared_experts_input is not None else 0 - ) - transformed_hidden_dim = hidden_states.shape[-1] - if ( - not self.routed_experts.quant_method.skip_forward_padding - and self.moe_config.hidden_dim != transformed_hidden_dim - ): - hidden_states = F.pad( - hidden_states, - (0, self.moe_config.hidden_dim - transformed_hidden_dim), - mode="constant", - value=0.0, - ) - - if self.routed_output_transform is not None and shared_experts_hidden_dim > 0: - orig_hidden_dims = shared_experts_hidden_dim - else: - orig_hidden_dims = transformed_hidden_dim - - return hidden_states, orig_hidden_dims - - def _maybe_apply_shared_experts( - self, - shared_experts_input: torch.Tensor | None, - order: SharedExpertsOrder, - ): - """Trigger shared expert computation at the specified ordering point. - - Shared experts can run at different points relative to routed experts - (EXTERNAL, BEFORE_QUANT_METHOD, AFTER_QUANT_METHOD) depending on the - model's overlap strategy. Only fires if shared experts are configured - and the order matches the shared experts' configured execution point. 
- """ - if self.shared_experts is not None: - assert shared_experts_input is not None - self.shared_experts(shared_experts_input, order) - - def _apply_quant_method( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> tuple[torch.Tensor | None, torch.Tensor]: - """Run expert routing and the fused MoE kernel via the quant method. - - Orchestrates shared expert execution (before/after), expert selection - via the router, and the actual fused MoE computation. Returns - (shared_expert_output, fused_expert_output). - """ - # Run this before quant_method to avoid inplace issues. - self._maybe_apply_shared_experts( - shared_experts_input, - SharedExpertsOrder.NO_OVERLAP, - ) - - if self.routed_experts.quant_method.is_monolithic: - # Monolithic kernels: pass router_logits to routed_experts - fused_out = self.routed_experts.forward( - x=hidden_states, - router_logits=router_logits, - ) - else: - # Modular kernels: select experts first, then call routed_experts - topk_weights, topk_ids = self.router.select_experts( - hidden_states=hidden_states, - router_logits=router_logits, - topk_indices_dtype=self.routed_experts.quant_method.topk_indices_dtype, - ) - - fused_out = self.routed_experts.forward( - x=hidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - shared_experts_input=shared_experts_input, - ) - - self._maybe_apply_shared_experts( - shared_experts_input, - SharedExpertsOrder.MULTI_STREAM_OVERLAPPED, - ) - - return ( - self.shared_experts.output if self.shared_experts is not None else None, - fused_out, - ) - - def _sequence_parallel_context(self): - """Return a context manager for sequence-parallel token - redistribution. - - When sequence parallelism is active, returns a context that handles - local size tracking for proper token scatter/gather. Otherwise - returns a no-op context. 
- """ - ctx = get_forward_context() - return ( - ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size) - if ctx.dp_metadata - else nullcontext() - ) - - def _maybe_overlap_gate_with_shared_experts( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor: - """Apply the gate module to compute router logits if provided. - - Used in overlapped mode where shared experts run in parallel with - routed experts on a separate CUDA stream. The gate is separated - from the router to allow this parallel execution. - """ - # If router/gate provided, then apply it here. - # (Note: This code runs only when "overlapped mode" is on to allow - # parallel execution of shared experts with the FusedMoE via - # separate cuda stream) - if self.shared_experts is not None: - self.shared_experts.maybe_setup_shared_experts_stream(shared_experts_input) - - if self.gate is not None: - router_logits, _ = self.gate(hidden_states) - - return router_logits - - def _maybe_add_zero_expert_output( - self, - result: torch.Tensor, - ) -> torch.Tensor: - """Add the zero expert's contribution to the final result. - - When a ZeroExpertRouter is used, it computes a bias-like output - from the "zero expert" that is added to the combined routed+shared - expert output. - """ - if isinstance(self.router, ZeroExpertRouter): - zero_expert_output = self.router.zero_expert_output - assert zero_expert_output is not None - result = result + zero_expert_output - return result - - def forward( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> torch.Tensor: - """Invoke the fused moe layer. - - Input: - - hidden_states - - router_logits - - Output: - - The new hidden_states. 
- - Calling sequence - - forward - - self._forward_entry (_moe_forward or _moe_forward_shared custom op) - - _forward_dispatch - - _forward_impl - - Note: The existence of _moe_forward and _moe_forward_shared custom ops are due - to the following reasons: - 1. the chunking loop in ChunkingMoERunner._forward_impl cannot be compiled by - torch.compile - 2. pytorch cannot handle union types in custom op signatures so - _moe_forward and _moe_forward_shared must be split. - - If ChunkingMoERunner._forward_impl can be implemented via torch.scan we can - potentially get rid of _moe_forward and _moe_forward_shared and collapse the - whole sequence into the 'forward' method. - """ - - # Apply transform for routed experts (e.g., latent projection - # for latent MoE) - hidden_states, shared_experts_input = self.apply_routed_input_transform( - hidden_states, - ) - - hidden_states, og_hidden_dim = self._maybe_pad_hidden_states( - shared_experts_input, - hidden_states, - ) - - router_logits = self._maybe_overlap_gate_with_shared_experts( - hidden_states, - router_logits, - shared_experts_input, - ) - - self._maybe_apply_shared_experts( - shared_experts_input, - SharedExpertsOrder.EXTERNAL, - ) - - result = self._forward_entry( - hidden_states, - router_logits, - shared_experts_input, - self._encode_layer_name(), - ) - - # - # Note: there are two all-reduce points below. They are mutually - # exclusive, controlled by _must_reduce_shared_expert_output(): - # - When True: the combine kernel already reduced fused_output, - # so we reduce shared_output here to match, then skip the - # all-reduce in _maybe_reduce_output. - # - When False: neither output is reduced yet, so we combine - # them first and all-reduce the sum in _maybe_reduce_output. - - # Extract outputs from result - shared_output, fused_output = _unpack(result) - - # Apply output transform (e.g. 
latent -> full dim) - fused_output = self.apply_routed_output_transform(fused_output) - - # If combine kernel already reduced fused, reduce shared to match. - # See note above re: the two all-reduce points. - shared_output = self._maybe_reduce_shared_expert_output(shared_output) - - shared_output, fused_output = self._maybe_apply_output_scale( - shared_output, fused_output - ) - - if shared_output is not None: - result = shared_output + fused_output - else: - result = fused_output - - result = self._maybe_reduce_output(result, og_hidden_dim) - - return self._maybe_add_zero_expert_output(result) - - def _forward_dispatch( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - """Entry point called by the custom op to run the MoE computation. - - Handles pre-dispatch setup (gate application, external shared expert - triggering, quant config init) then delegates to _forward_impl within - the sequence-parallel context. - """ - # TODO(bnell): this can be removed after MK migration is complete. - self.routed_experts._ensure_moe_quant_config_init() - - with self._sequence_parallel_context(): - return self._forward_impl( - hidden_states, - router_logits, - shared_experts_input, - ) - - @abstractmethod - def _forward_impl( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - """Core MoE computation to be implemented by subclasses. - - Performs expert routing, fused MoE kernel execution, and shared - expert computation. Returns a single tensor (fused output only) - or a tuple of (shared_output, fused_output) when shared experts - are present. 
- """ - raise NotImplementedError - - # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - # - # Old methods from FusedMoE layer - # - # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - - # Note: maybe_init_modular_kernel should only be called by - # prepare_communication_buffer_for_model. - # This is called after all weight loading and post-processing, so it - # should be safe to swap out the quant_method. - def maybe_init_modular_kernel(self) -> None: - # NOTE(rob): WIP refactor. For quant methods that own the MK - # we create the MK during process_weights_after_loading. - if ( - self.routed_experts.quant_method.supports_internal_mk - or self.routed_experts.quant_method.is_monolithic - ): - return None - - self.routed_experts._ensure_moe_quant_config_init() - # routing_tables only needed for round-robin expert placement with - # DeepEP all2all backend. - routing_tables = self._maybe_init_expert_routing_tables() - - if isinstance(self.routed_experts.quant_method, FusedMoEModularMethod): - base_quant_method = self.routed_experts.quant_method.old_quant_method - else: - base_quant_method = self.routed_experts.quant_method - - prepare_finalize = base_quant_method.maybe_make_prepare_finalize( - routing_tables=routing_tables - ) - if prepare_finalize is not None: - logger.debug( - "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) - ) - self._replace_quant_method( - FusedMoEModularMethod.make( - self, - base_quant_method, - prepare_finalize, - self.shared_experts, - inplace=not base_quant_method.moe.disable_inplace, - ) - ) - - # - # Properties - # - - @property - def layer_id(self): - # Delayed import to avoid circular dependency - from vllm.model_executor.models.utils import extract_layer_index - - return extract_layer_index(self.layer_name) - - # - # Attributes still needed by models - # - - @property - def is_monolithic(self) -> bool: - return self.routed_experts.quant_method.is_monolithic - - 
@property - def activation(self) -> MoEActivation: - return self.routed_experts.activation - - # - # Expert maps - # - - @property - def expert_placement_strategy(self) -> ExpertPlacementStrategy: - return self.expert_map_manager.placement_strategy - - @property - def expert_global_to_physical(self) -> torch.Tensor | None: - tables = self.expert_map_manager.routing_tables - return tables[0] if tables else None - - @property - def expert_physical_to_global(self) -> torch.Tensor | None: - """Routing table: physical expert ID to global expert ID.""" - tables = self.expert_map_manager.routing_tables - return tables[1] if tables else None - - @property - def expert_local_to_global(self) -> torch.Tensor | None: - """Routing table: local expert ID to global expert ID.""" - tables = self.expert_map_manager.routing_tables - return tables[2] if tables else None - - @property - def expert_map(self) -> torch.Tensor | None: - return self.routed_experts.expert_map - - def _maybe_init_expert_routing_tables( - self, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - return self.routed_experts._maybe_init_expert_routing_tables() - - def update_expert_map(self): - self.routed_experts.update_expert_map() - - def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - """Map global expert ID to local expert ID.""" - return self.routed_experts._map_global_expert_id_to_local_expert_id(expert_id) - - # - # EPLB - # - - def get_expert_weights(self) -> Iterable[torch.Tensor]: - """Delegate to EPLB manager.""" - if self.router.eplb_manager is not None: - return self.router.eplb_manager.get_expert_weights(self.routed_experts) - else: - return [] - - def set_eplb_state( - self, - moe_layer_idx: int, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - """ - Register the EPLB state in this layer. 
- - This is used later in forward pass, where we get the expert mapping - and record the load metrics in `expert_load_view`. - """ - if self.router.eplb_manager is not None: - self.router.eplb_manager.set_state( - moe_layer_idx, - expert_load_view, - logical_to_physical_map, - logical_replica_count, - ) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py deleted file mode 100644 index f5a43991e17e..000000000000 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_factory.py +++ /dev/null @@ -1,65 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, -) -from vllm.model_executor.layers.fused_moe.routed_experts import ( - RoutedExperts, -) -from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( - FusedMoERouter, -) -from vllm.model_executor.layers.fused_moe.runner.chunking_moe_runner import ( - ChunkingMoERunner, -) -from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import ( - DefaultMoERunner, -) -from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner - - -def create_moe_runner( - layer_name: str, - moe_config: FusedMoEConfig, - router: FusedMoERouter, - routed_input_transform: torch.nn.Module | None, - gate: torch.nn.Module | None, - shared_experts: torch.nn.Module | None, - routed_experts: RoutedExperts, - enable_dbo: bool, - routed_output_transform: torch.nn.Module | None = None, - apply_scale_to_output: bool = False, - routed_scaling_factor: float = 1.0, -) -> MoERunner: - runner = DefaultMoERunner( - layer_name, - moe_config, - router, - routed_input_transform, - gate, - shared_experts, - routed_experts, - enable_dbo, - routed_output_transform=routed_output_transform, - apply_scale_to_output=apply_scale_to_output, - 
routed_scaling_factor=routed_scaling_factor, - ) - if moe_config.moe_parallel_config.use_dp_chunking: - return ChunkingMoERunner( - inner=runner, - layer_name=layer_name, - moe_config=moe_config, - router=router, - routed_input_transform=routed_input_transform, - gate=gate, - shared_experts=shared_experts, - routed_experts=routed_experts, - enable_dbo=enable_dbo, - routed_output_transform=routed_output_transform, - apply_scale_to_output=apply_scale_to_output, - routed_scaling_factor=routed_scaling_factor, - ) - return runner diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py new file mode 100644 index 000000000000..ad7b319e38c3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import abstractmethod +from collections.abc import Iterable + +import torch + +from vllm.config.parallel import ExpertPlacementStrategy +from vllm.model_executor.layers.fused_moe.activation import MoEActivation +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, +) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) + + +class MoERunnerInterface(torch.nn.Module): + """ + Abstract base class for Mixture of Experts (MoE) runners. + + This class defines the interface that all MoE runner implementations must follow. + MoE runners are responsible for executing the forward pass of MoE layers, handling + expert routing, and managing tensor parallel operations. 
+ """ + + def __init__(self): + super().__init__() + # HACK + self._already_called_process_weights_after_loading = True + + @abstractmethod + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor: + raise NotImplementedError + + @property + @abstractmethod + def shared_experts(self) -> SharedExperts | None: + raise NotImplementedError + + @property + @abstractmethod + def is_internal_router(self) -> bool: + raise NotImplementedError + + # Temporary hack + @abstractmethod + def _replace_quant_method(self, quant_method: FusedMoEMethodBase): + raise NotImplementedError + + ######################################################################## + # + # FusedMoE layer methods + # + ######################################################################## + + @abstractmethod + def maybe_init_modular_kernel(self) -> None: + raise NotImplementedError + + @property + @abstractmethod + def layer_id(self): + raise NotImplementedError + + # + # Attributes still needed by models + # + + @property + @abstractmethod + def is_monolithic(self) -> bool: + raise NotImplementedError + + @property + @abstractmethod + def activation(self) -> MoEActivation: + raise NotImplementedError + + # + # Expert maps + # + + @property + @abstractmethod + def expert_placement_strategy(self) -> ExpertPlacementStrategy: + raise NotImplementedError + + @property + @abstractmethod + def expert_global_to_physical(self) -> torch.Tensor | None: + raise NotImplementedError + + @property + @abstractmethod + def expert_physical_to_global(self) -> torch.Tensor | None: + raise NotImplementedError + + @property + @abstractmethod + def expert_local_to_global(self) -> torch.Tensor | None: + raise NotImplementedError + + @property + @abstractmethod + def expert_map(self) -> torch.Tensor | None: + raise NotImplementedError + + @abstractmethod + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + raise 
NotImplementedError + + @abstractmethod + def update_expert_map(self): + raise NotImplementedError + + @abstractmethod + def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: + raise NotImplementedError + + @abstractmethod + def get_expert_weights(self) -> Iterable[torch.Tensor]: + raise NotImplementedError + + @abstractmethod + def set_eplb_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index c1aceacf2efa..1fe4a27a5c30 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -42,14 +42,14 @@ def __init__( self, layer: torch.nn.Module, moe_config: FusedMoEConfig, - mk_owns_shared_expert: bool, + mk_can_overlap_shared_experts: bool, ): super().__init__() self._output: torch.Tensor | None = None self._layer = layer self._moe_config = moe_config - self._mk_owns_shared_expert = mk_owns_shared_expert + self._mk_can_overlap_shared_experts = mk_can_overlap_shared_experts self._use_dp_chunking = moe_config.moe_parallel_config.use_dp_chunking # Allow disabling of the separate shared experts stream for @@ -89,7 +89,7 @@ def _determine_shared_experts_order( if self._has_external_experts and not self._use_dp_chunking: return SharedExpertsOrder.EXTERNAL - if self._mk_owns_shared_expert: + if self._mk_can_overlap_shared_experts: return SharedExpertsOrder.MK_INTERNAL_OVERLAPPED should_run_shared_in_aux_stream = ( diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py index c2ffbeedc41d..46c1ced9b276 100644 --- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +++ 
b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py @@ -1,28 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner -# TODO(bnell): Remove this entirely -# class SharedFusedMoE(FusedMoE): -# """ -# A FusedMoE operation that also computes the results of shared experts. -# If an all2all communicator is being used the shared expert computation -# can be interleaved with the fused all2all dispatch communication step. -# """ - -# def forward( -# self, -# hidden_states: torch.Tensor, -# router_logits: torch.Tensor, -# ) -> torch.Tensor: -# return super().forward( -# hidden_states=hidden_states, -# router_logits=router_logits, -# ) - +# TODO(bnell): this will be deleted def SharedFusedMoE(*args, **kwargs) -> MoERunner: return FusedMoE(*args, **kwargs) diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index de100fde9387..1fc478437d31 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -27,6 +27,9 @@ FusedMoEExpertsModular, FusedMoEPrepareAndFinalizeModular, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) if TYPE_CHECKING: from vllm.model_executor.layers.fused_moe.routed_experts import ( @@ -97,9 +100,12 @@ def forward_native( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: - return self.forward_cuda(layer, x, topk_weights, topk_ids, shared_experts_input) + return self.forward_cuda( + layer, x, topk_weights, topk_ids, shared_experts, shared_experts_input + ) @property def is_monolithic(self) 
-> bool: @@ -307,6 +313,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: return self.forward( @@ -314,6 +321,7 @@ def apply( x=x, topk_weights=topk_weights, topk_ids=topk_ids, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -332,6 +340,7 @@ def forward_cuda( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.kernel is not None @@ -346,6 +355,7 @@ def forward_cuda( apply_router_weight_on_input=layer.apply_router_weight_on_input, global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 1e14f715f609..3620773eba1f 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -15,15 +15,14 @@ choose_mp_linear_kernel, ) from vllm.model_executor.layers.fused_moe import ( + FusedMoEConfig, FusedMoEMethodBase, + FusedMoEQuantConfig, FusedMoeWeightScaleSupported, RoutedExperts, + SharedExperts, UnquantizedFusedMoEMethod, ) -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, - FusedMoEQuantConfig, -) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.linear import ( LinearBase, @@ -804,6 +803,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: return fused_marlin_moe( diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py 
b/vllm/model_executor/layers/quantization/bitsandbytes.py index 0bfb146f1eca..a32ed3196c1a 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -7,12 +7,11 @@ from packaging import version from vllm.model_executor.layers.fused_moe import ( - FusedMoEMethodBase, - RoutedExperts, -) -from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, + FusedMoEMethodBase, FusedMoEQuantConfig, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.linear import ( LinearBase, @@ -482,6 +481,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index e0747c89ce41..ea4ed6134c01 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -22,6 +22,7 @@ FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, + SharedExperts, UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.activation import MoEActivation @@ -344,7 +345,6 @@ def process_weights_after_loading(self, layer: RoutedExperts) -> None: moe_config=self.moe, experts_cls=self.experts_cls, mxfp4_backend=self.mxfp4_backend, - shared_experts=layer.shared_experts, routing_tables=layer._maybe_init_expert_routing_tables(), ) @@ -354,6 +354,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.moe_kernel is not None @@ -367,6 +368,7 @@ def apply( 
global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -573,7 +575,6 @@ def process_weights_after_loading(self, layer: RoutedExperts) -> None: moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, - shared_experts=layer.shared_experts, routing_tables=layer._maybe_init_expert_routing_tables(), ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) @@ -627,6 +628,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.moe_kernel is not None @@ -640,6 +642,7 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -935,7 +938,6 @@ def process_weights_after_loading(self, layer: RoutedExperts) -> None: fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def maybe_make_prepare_finalize( @@ -988,6 +990,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -1004,6 +1007,7 @@ def apply( # https://github.com/vllm-project/vllm/commit/84166fee9770e6fba71a96978b3e7d149392fb28 # noqa: E501 expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -1128,6 +1132,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: 
SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts @@ -1639,6 +1644,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.kernel_backend == "Marlin" @@ -1888,6 +1894,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts @@ -2505,6 +2512,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: if layer.enable_eplb: diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 5a83e6360d2b..1046fbc3fe6f 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -10,6 +10,7 @@ FusedMoEConfig, FusedMoEMethodBase, RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -140,6 +141,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 4bc1869e5baa..fb265f20f060 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -24,6 +24,7 @@ FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, + SharedExperts, UnquantizedFusedMoEMethod, ) from 
vllm.model_executor.layers.fused_moe.config import ( @@ -840,7 +841,6 @@ def _setup_kernel( fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def process_weights_after_loading(self, layer: RoutedExperts) -> None: @@ -964,6 +964,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -978,6 +979,7 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 37ccd9ab28d1..1902f88d0e7b 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -13,17 +13,14 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( + FusedMoEConfig, FusedMoEMethodBase, - RoutedExperts, -) -from vllm.model_executor.layers.fused_moe.activation import ( + FusedMoEQuantConfig, MoEActivation, + RoutedExperts, + SharedExperts, apply_moe_activation, ) -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, - FusedMoEQuantConfig, -) from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -636,6 +633,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: if layer.apply_router_weight_on_input: diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 
cf050e07311c..24bfbafa7847 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -15,15 +15,14 @@ choose_mp_linear_kernel, ) from vllm.model_executor.layers.fused_moe import ( + FusedMoEConfig, FusedMoEMethodBase, + FusedMoEQuantConfig, FusedMoeWeightScaleSupported, RoutedExperts, + SharedExperts, UnquantizedFusedMoEMethod, ) -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, - FusedMoEQuantConfig, -) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs from vllm.model_executor.layers.quantization import QuantizationMethods @@ -899,6 +898,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: return fused_marlin_moe( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index d955733d0358..e522170d3c4d 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -12,17 +12,14 @@ from vllm.model_executor.kernels.linear import init_fp8_linear_kernel from vllm.model_executor.layers.attention import Attention, MLAAttention from vllm.model_executor.layers.fused_moe import ( - FusedMoeWeightScaleSupported, - RoutedExperts, -) -from vllm.model_executor.layers.fused_moe.activation import MoEActivation -from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, + FusedMoEMethodBase, FusedMoEQuantConfig, + FusedMoeWeightScaleSupported, + MoEActivation, + RoutedExperts, RoutingMethodType, -) -from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( - FusedMoEMethodBase, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( Fp8MoeBackend, @@ -879,7 
+876,6 @@ def _setup_kernel( fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def process_weights_after_loading(self, layer: RoutedExperts) -> None: @@ -956,6 +952,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -970,6 +967,7 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -1391,7 +1389,6 @@ def process_weights_after_loading(self, layer: RoutedExperts) -> None: moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, - shared_experts=layer.shared_experts, routing_tables=layer._maybe_init_expert_routing_tables(), ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) @@ -1440,6 +1437,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -1454,6 +1452,7 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -2033,6 +2032,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 03de2361ceb5..5710ff019f2f 100644 --- 
a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -11,6 +11,7 @@ FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( @@ -368,6 +369,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 55fb1ede2258..b6ecce51e77d 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -9,12 +9,11 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoEConfig, FusedMoEMethodBase, + FusedMoEQuantConfig, RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe import modular_kernel as mk -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEQuantConfig, -) from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import ( TRITON_BACKENDS, Mxfp4MoeBackend, @@ -326,7 +325,6 @@ def _setup_kernel( mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def process_weights_after_loading(self, layer: RoutedExperts) -> None: @@ -380,6 +378,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -394,6 +393,7 @@ def apply( global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=layer.expert_map, + shared_experts=shared_experts, 
shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index c43dd2066303..021929db0c8b 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -16,6 +16,7 @@ FusedMoeWeightScaleSupported, MoEActivation, RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -445,6 +446,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: if self.rocm_aiter_moe_enabled: @@ -634,6 +636,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( @@ -1042,6 +1045,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: if not self.emulate: diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 75cded8b7f0f..89e439d17271 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -41,9 +41,13 @@ def is_moe_layer(module: torch.nn.Module) -> bool: # presence of quant_method.maybe_init_modular_kernel? 
# return (hasattr(module, "quant_method") # and hasattr(module.quant_method, "moe_kernel")) - return ( - module.__class__.__name__ == "FusedMoE" - or module.__class__.__name__ == "SharedFusedMoE" - or module.__class__.__name__ == "DefaultMoERunner" - or module.__class__.__name__ == "ChunkingMoERunner" - ) + + def _check_bases(cls): + if cls.__name__ == "MoERunner": + return True + + for b in cls.__bases__: + if _check_bases(b): + return True + + return _check_bases(module.__class__) From 09abcbd108bafa014bc2eced372021e742431530 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 22 Apr 2026 18:43:34 +0000 Subject: [PATCH 102/191] fix annotation Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/__init__.py | 4 ---- vllm/model_executor/layers/fused_moe/runner/moe_runner.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index dec1c789a627..7b5ac476d7cf 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -37,9 +37,6 @@ from vllm.model_executor.layers.fused_moe.runner.moe_runner import ( MoERunner, ) -from vllm.model_executor.layers.fused_moe.runner.moe_runner_factory import ( - create_moe_runner, -) from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( SharedExperts, ) @@ -85,7 +82,6 @@ def get_config() -> dict[str, Any] | None: "UnquantizedFusedMoEMethod", "activation_without_mul", "apply_moe_activation", - "create_moe_runner", "fused_moe_make_expert_params_mapping", "get_config", "override_config", diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 82c1119e721a..ff6e606ace14 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -686,7 +686,7 @@ def _maybe_combine( self, 
shared_output: torch.Tensor | None, hidden_states: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]: + ) -> torch.Tensor | tuple[torch.Tensor | None, torch.Tensor]: if self.do_naive_dispatch_combine: hidden_states = get_ep_group().combine( hidden_states, self.moe_config.is_sequence_parallel From f7b705de7a925c657059c27e02e2641909930d8e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 01:05:42 +0000 Subject: [PATCH 103/191] revert some stuff Signed-off-by: Bill Nell --- .../configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml | 10 ---------- tests/evals/gsm8k/configs/moe-refactor/config-h100.txt | 1 - 2 files changed, 11 deletions(-) delete mode 100644 tests/evals/gsm8k/configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml diff --git a/tests/evals/gsm8k/configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml b/tests/evals/gsm8k/configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml deleted file mode 100644 index ca5c9a00ed37..000000000000 --- a/tests/evals/gsm8k/configs/moe-refactor/LongCat-Flash-Chat-FP8.yaml +++ /dev/null @@ -1,10 +0,0 @@ -model_name: "meituan-longcat/LongCat-Flash-Chat-FP8" -accuracy_threshold: 0.70 -num_questions: 1319 -num_fewshot: 5 -startup_max_wait_seconds: 1200 -server_args: >- - --enforce-eager - --max-model-len 4096 - --tensor-parallel-size 8 - --enable-expert-parallel diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt index 912ba878fae6..7397fc4e4626 100644 --- a/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt +++ b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt @@ -10,4 +10,3 @@ Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml Qwen3-30B-A3B-BF16-fi-cutlass.yaml Qwen3-30B-A3B-BF16-triton.yaml -LongCat-Flash-Chat-FP8.yaml From 2630038d96eb0c04945ea855de68888a4dabe15e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 02:35:22 +0000 Subject: [PATCH 104/191] tweaks Signed-off-by: 
Bill Nell --- .../layers/fused_moe/runner/moe_runner_interface.py | 5 +++++ vllm/model_executor/warmup/deep_gemm_warmup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py index adee5944aed9..4fd7601a1e04 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py @@ -48,6 +48,11 @@ def shared_experts(self) -> SharedExperts | None: def is_internal_router(self) -> bool: raise NotImplementedError + @property + @abstractmethod + def _quant_method(self) -> FusedMoEMethodBase: + raise NotImplementedError + # Temporary hack @abstractmethod def _replace_quant_method(self, quant_method: FusedMoEMethodBase): diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index e524c803447c..476f83a8ad6b 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -160,7 +160,7 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: if not isinstance(module, MoERunner): return False - quant_method = module.routed_experts.quant_method + quant_method = module._quant_method moe_quant_config = quant_method.get_fused_moe_quant_config(module.routed_experts) if ( From ca58511624b5226ed808c18a44ff2206f4e066d9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 25 Apr 2026 01:07:43 +0000 Subject: [PATCH 105/191] trunc before Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/moe_runner.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index d2fa8feba58c..c5be99b5698b 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ 
b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -359,9 +359,8 @@ def _maybe_reduce_shared_expert_output( def _maybe_reduce_final_output( self, states: torch.Tensor, - trunc_size: int, ) -> torch.Tensor: - """Truncate padded dimensions and all-reduce the combined output. + """All-reduce the combined output if needed. This is the "late" all-reduce path. When neither fused nor shared output was individually reduced, the combined sum is all-reduced @@ -378,7 +377,7 @@ def _maybe_reduce_final_output( ): states = tensor_model_parallel_all_reduce(states) - return states[..., :trunc_size] + return states def _encode_layer_name(self) -> str | LayerName: if _USE_LAYERNAME: @@ -581,6 +580,9 @@ def forward( # Extract outputs from result shared_output, fused_output = _unpack(result) + # Remember 40794. Double check tests/lora/test_gpt_oss.py::test_gpt_oss_tp2 + fused_output = fused_output[:, :og_hidden_dim] + # If combine kernel already reduced fused, reduce shared to match. # See note above re: the two all-reduce points. 
shared_output = self._maybe_reduce_shared_expert_output(shared_output) @@ -597,7 +599,7 @@ def forward( else: result = fused_output - result = self._maybe_reduce_final_output(result, og_hidden_dim) + result = self._maybe_reduce_final_output(result) return self._maybe_add_zero_expert_output(result) From df142365759e91e1ef49c85b00dc0af95a306136 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 25 Apr 2026 01:10:33 +0000 Subject: [PATCH 106/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/warmup/deep_gemm_warmup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 476f83a8ad6b..7d9af774e810 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -12,8 +12,8 @@ import vllm.envs as envs from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank from vllm.model_executor.layers.fused_moe import MoERunner -from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M +from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, ) From 9fbd5b7f3c9a25261dfa87a737d8776164434824 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 20:34:07 +0000 Subject: [PATCH 107/191] don't store SharedExperts in MK Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 3 - .../layers/fused_moe/__init__.py | 13 ++++ .../layers/fused_moe/experts/cutlass_moe.py | 1 - .../layers/fused_moe/fused_moe_method_base.py | 11 ++- .../fused_moe/fused_moe_modular_method.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 1 - .../layers/fused_moe/modular_kernel.py | 29 ++++---- .../layers/fused_moe/oracle/fp8.py | 8 -- .../layers/fused_moe/oracle/int8.py | 5 
-- .../layers/fused_moe/oracle/int_wna16.py | 2 - .../layers/fused_moe/oracle/mxfp4.py | 6 -- .../layers/fused_moe/oracle/nvfp4.py | 8 -- .../layers/fused_moe/oracle/unquantized.py | 5 -- .../layers/fused_moe/runner/moe_runner.py | 1 + .../layers/fused_moe/runner/shared_experts.py | 2 +- .../fused_moe/unquantized_fused_moe_method.py | 22 ++++-- .../model_executor/layers/quantization/awq.py | 4 +- .../layers/quantization/awq_marlin.py | 24 +++--- .../layers/quantization/bitsandbytes.py | 18 ++--- .../compressed_tensors/compressed_tensors.py | 12 +-- .../compressed_tensors_moe.py | 2 +- .../compressed_tensors_moe_w4a4_mxfp4.py | 10 ++- .../compressed_tensors_moe_w4a4_nvfp4.py | 12 +-- .../compressed_tensors_moe_w4a8_fp8.py | 8 +- .../compressed_tensors_moe_w4a8_int8.py | 4 +- .../compressed_tensors_moe_w8a8_fp8.py | 18 +++-- .../compressed_tensors_moe_w8a8_int8.py | 12 +-- .../compressed_tensors_moe_w8a8_mxfp8.py | 12 +-- .../compressed_tensors_moe_wna16.py | 6 +- .../compressed_tensors_moe_wna16_marlin.py | 8 +- .../layers/quantization/experts_int8.py | 8 +- .../model_executor/layers/quantization/fp8.py | 46 ++++++------ .../layers/quantization/gguf.py | 24 +++--- .../layers/quantization/gptq.py | 4 +- .../layers/quantization/gptq_marlin.py | 56 +++++++------- .../model_executor/layers/quantization/inc.py | 11 ++- .../layers/quantization/modelopt.py | 74 +++++++++---------- .../layers/quantization/moe_wna16.py | 24 +++--- .../layers/quantization/mxfp4.py | 32 ++++---- .../layers/quantization/online/base.py | 4 +- .../layers/quantization/online/fp8.py | 5 +- .../layers/quantization/online/int8.py | 5 +- .../layers/quantization/online/moe_base.py | 18 +++-- .../layers/quantization/online/mxfp8.py | 7 +- .../layers/quantization/quark/quark.py | 4 +- .../layers/quantization/quark/quark_moe.py | 55 +++++++------- .../quantization/utils/flashinfer_fp4_moe.py | 6 +- .../layers/quantization/utils/marlin_utils.py | 5 +- 48 files changed, 339 insertions(+), 320 deletions(-) 
diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 284ac54997fb..5e918b9399c8 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -47,9 +47,6 @@ def __init__(self, base_layer: FusedMoE) -> None: self.base_layer.ensure_moe_quant_config_init() if getattr(self.base_layer.quant_method, "supports_internal_mk", False): moe_kernel = self.base_layer.quant_method.moe_kernel - # Don't let the kernel own shared experts so the runner can - # overlap them with routed experts via a separate CUDA stream. - moe_kernel.shared_experts = None else: prepare_finalize = MoEPrepareAndFinalizeNoDPEPModular() moe_kernel = FusedMoEKernel( diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 75a9faddc1f0..a7d0229048ce 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -11,6 +11,8 @@ ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, + FusedMoEParallelConfig, + FusedMoEQuantConfig, RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( @@ -30,6 +32,9 @@ FusedMoERouter, ) from vllm.model_executor.layers.fused_moe.router.gate_linear import GateLinear +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) @@ -38,6 +43,10 @@ _config: dict[str, Any] | None = None +# Temporary alias for FusedMoE; eventually it will be its own class. 
+RoutedExperts = FusedMoE + + @contextmanager def override_config(config): global _config @@ -55,6 +64,8 @@ def get_config() -> dict[str, Any] | None: "FusedMoE", "FusedMoERouter", "FusedMoEConfig", + "FusedMoEQuantConfig", + "FusedMoEParallelConfig", "FusedMoEMethodBase", "MoEActivation", "UnquantizedFusedMoEMethod", @@ -64,6 +75,8 @@ def get_config() -> dict[str, Any] | None: "FusedMoEPrepareAndFinalizeModular", "GateLinear", "RoutingMethodType", + "RoutedExperts", + "SharedExperts", "activation_without_mul", "apply_moe_activation", "fused_moe_make_expert_params_mapping", diff --git a/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py index fdd802e7da3a..c1274a3c14d5 100644 --- a/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py @@ -1489,7 +1489,6 @@ def cutlass_moe_w4a8_fp8( quant_config=quant_config, group_size=group_size, ), - shared_experts=None, inplace=False, ) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 7d54e2b717d6..85189e873666 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod +from typing import TYPE_CHECKING import torch @@ -20,6 +21,9 @@ QuantizeMethodBase, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.runner.shared_experts import SharedExperts + logger = init_logger(__name__) @@ -37,10 +41,12 @@ def supports_internal_mk(self) -> bool: return self.moe_kernel is not None @property - def mk_owns_shared_expert(self) -> bool: + def mk_can_overlap_shared_experts(self) -> bool: # NOTE(rob): temporary attribute to indicate support for # completed migration to the new internal MK interface. 
- return self.moe_kernel is not None and self.moe_kernel.owns_shared_experts + return ( + self.moe_kernel is not None and self.moe_kernel.can_overlap_shared_experts + ) @abstractmethod def create_weights( @@ -160,6 +166,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: "SharedExperts" | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 142e180786c6..42446c4876a0 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -47,7 +47,6 @@ def make( moe_layer: torch.nn.Module, old_quant_method: FusedMoEMethodBase, prepare_finalize: FusedMoEPrepareAndFinalizeModular, - shared_experts: SharedExperts | None, inplace: bool = False, ) -> "FusedMoEModularMethod": return FusedMoEModularMethod( @@ -55,7 +54,6 @@ def make( FusedMoEKernel( prepare_finalize, old_quant_method.select_gemm_impl(prepare_finalize, moe_layer), - shared_experts=shared_experts, inplace=inplace, ), ) @@ -90,6 +88,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.moe_kernel is not None @@ -103,5 +102,6 @@ def apply( global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=None if self.disable_expert_map else layer.expert_map, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7174cdd88f25..da71a581230e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -644,7 
+644,6 @@ def maybe_init_modular_kernel(self) -> None: self, self.base_quant_method, prepare_finalize, - self.shared_experts, inplace=not self.moe_config.disable_inplace, ) ) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index b0f967085ae4..a50c969ceb2e 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1018,17 +1018,10 @@ def __init__( self, prepare_finalize: FusedMoEPrepareAndFinalizeModular, fused_experts: FusedMoEExpertsModular, - shared_experts: SharedExperts | None, inplace: bool = False, ): self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts - # Only accept shared experts if they can be run w/async. - # The MoERunner/SharedExperts class will coordinate with the MK to ensure - # that the SharedExperts are executed only once. - self.shared_experts = ( - shared_experts if prepare_finalize.supports_async() else None - ) self.inplace = inplace moe_parallel_config = fused_experts.moe_config.moe_parallel_config self.moe_parallel_config = moe_parallel_config @@ -1103,11 +1096,12 @@ def _allocate_buffers( def _maybe_apply_shared_experts( self, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ): - if self.shared_experts is not None: + if shared_experts is not None: assert shared_experts_input is not None - self.shared_experts.apply( + shared_experts.apply( shared_experts_input, SharedExpertsOrder.MK_INTERNAL_OVERLAPPED, ) @@ -1271,6 +1265,7 @@ def _finalize( topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: """ @@ -1278,6 +1273,7 @@ def _finalize( that handles DBO, async and shared expert overlap. Args: + shared_experts: SharedExperts | None. The shared experts if any. 
shared_experts_input: Optional separate input for shared experts. When latent MoE is used, hidden_states is the latent-projected tensor (smaller dimension) used by routed experts, while @@ -1304,7 +1300,7 @@ def _finalize( apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - self._maybe_apply_shared_experts(shared_experts_input) + self._maybe_apply_shared_experts(shared_experts, shared_experts_input) # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1340,6 +1336,7 @@ def apply( global_num_experts: int = -1, expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, + shared_experts: SharedExperts | None = None, shared_experts_input: torch.Tensor | None = None, ) -> torch.Tensor: """ @@ -1362,6 +1359,7 @@ def apply( - apply_router_weight_on_input (bool): When true, the topk weights are applied directly on the inputs. This is only applicable when topk is 1. + - shared_experts: SharedExperts | None. The shared experts if any. - shared_experts_input (Optional[torch.Tensor]): Optional separate input for shared experts. For latent MoE, this is the original hidden_states before latent projection. @@ -1370,7 +1368,7 @@ def apply( - torch.Tensor: The output tensor after applying the MoE layer. 
""" if self.inplace: - assert self.shared_experts is None + assert shared_experts is None assert not disable_inplace() output = hidden_states else: @@ -1412,6 +1410,7 @@ def apply( topk_weights, topk_ids, apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -1484,7 +1483,6 @@ def __init__( self, prepare_finalize: FusedMoEPrepareAndFinalize, fused_experts: FusedMoEExperts, - shared_experts: SharedExperts | None = None, inplace: bool = False, ): super().__init__() @@ -1497,7 +1495,6 @@ def __init__( self.impl = FusedMoEKernelModularImpl( prepare_finalize, fused_experts, - shared_experts, inplace, ) @@ -1520,9 +1517,9 @@ def __init__( self._post_init_setup() @property - def owns_shared_experts(self) -> bool: + def can_overlap_shared_experts(self) -> bool: if isinstance(self.impl, FusedMoEKernelModularImpl): - return self.impl.shared_experts is not None + return self.impl.prepare_finalize.supports_async() else: return False @@ -1608,6 +1605,7 @@ def apply( global_num_experts: int, expert_map: torch.Tensor | None, apply_router_weight_on_input: bool, + shared_experts: SharedExperts | None = None, shared_experts_input: torch.Tensor | None = None, ) -> torch.Tensor: assert isinstance(self.impl, FusedMoEKernelModularImpl) @@ -1621,5 +1619,6 @@ def apply( global_num_experts=global_num_experts, expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index 74be17eaa55f..cb687b10536a 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -18,9 +18,6 @@ fp8_w8a8_moe_quant_config, fp8_w8a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from 
vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( FlashinferMoeBackend, get_flashinfer_moe_backend, @@ -563,7 +560,6 @@ def make_fp8_moe_kernel( experts_cls: type[mk.FusedMoEExperts], fp8_backend: Fp8MoeBackend, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: SharedExperts | None = None, ) -> mk.FusedMoEKernel: # Create Prepare/Finalize. prepare_finalize = maybe_make_prepare_finalize( @@ -593,13 +589,9 @@ def make_fp8_moe_kernel( quant_config=moe_quant_config, ) - # NOTE(rob): we only want the mk to control the shared_expert - # if using all2all (for SBO). bnell is making this explicit in - # the new MoE runner class. kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts, inplace=( not moe_config.disable_inplace and fp8_backend != Fp8MoeBackend.FLASHINFER_CUTLASS diff --git a/vllm/model_executor/layers/fused_moe/oracle/int8.py b/vllm/model_executor/layers/fused_moe/oracle/int8.py index cdb1be108b5d..e74c329d8369 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/int8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/int8.py @@ -17,9 +17,6 @@ int8_w8a8_moe_quant_config, int8_w8a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kInt8DynamicTokenSym, @@ -181,7 +178,6 @@ def make_int8_moe_kernel( moe_config: FusedMoEConfig, experts_cls: type[mk.FusedMoEExperts], routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: SharedExperts | None = None, ) -> mk.FusedMoEKernel: # Create Prepare/Finalize. 
prepare_finalize = maybe_make_prepare_finalize( @@ -214,7 +210,6 @@ def make_int8_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts, inplace=not moe_config.disable_inplace, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/int_wna16.py b/vllm/model_executor/layers/fused_moe/oracle/int_wna16.py index 5503d233f128..7945a523dda4 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/int_wna16.py +++ b/vllm/model_executor/layers/fused_moe/oracle/int_wna16.py @@ -154,7 +154,6 @@ def make_wna16_moe_kernel( w13_g_idx_sort_indices: torch.Tensor | None, w2_g_idx_sort_indices: torch.Tensor | None, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: torch.nn.Module | None = None, ) -> mk.FusedMoEKernel: # Currently, we only support MarlinExperts and BatchedMarlinExperts assert experts_cls in (MarlinExperts, BatchedMarlinExperts) @@ -202,7 +201,6 @@ def make_wna16_moe_kernel( return mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts, inplace=not moe_config.disable_inplace, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py index f476d980d555..e5f5e2ecfccc 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -1245,7 +1245,6 @@ def make_mxfp4_moe_kernel( experts_cls: type[mk.FusedMoEExperts], mxfp4_backend: Mxfp4MoeBackend, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: torch.nn.Module | None = None, ) -> mk.FusedMoEKernel: """Create a FusedMoEKernel for the given MXFP4 backend.""" is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic) @@ -1281,11 +1280,6 @@ def make_mxfp4_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=( - shared_experts - if moe_config.moe_parallel_config.use_batched_activation_format - 
else None - ), inplace=( not moe_config.disable_inplace and mxfp4_backend not in TRTLLM_BACKENDS ), diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index db6d56e3c3ac..39d404a3d4a5 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ -17,9 +17,6 @@ nvfp4_moe_quant_config, nvfp4_w4a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( prepare_nvfp4_moe_layer_for_fi_or_cutlass, prepare_nvfp4_moe_layer_for_flashinfer_cutedsl, @@ -469,7 +466,6 @@ def make_nvfp4_moe_kernel( moe_config: FusedMoEConfig, experts_cls: type[mk.FusedMoEExperts], routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: SharedExperts | None = None, ) -> mk.FusedMoEKernel: # Create Prepare/Finalize. prepare_finalize = maybe_make_prepare_finalize( @@ -499,13 +495,9 @@ def make_nvfp4_moe_kernel( quant_config=moe_quant_config, ) - # NOTE(rob): we only want the mk to control the shared_expert - # if using all2all (for SBO). bnell is making this explicit in - # the new MoE runner class. 
kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts, inplace=False, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py index 8240a5e8c963..f98881494699 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py +++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py @@ -18,9 +18,6 @@ FusedMoEConfig, FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( - SharedExperts, -) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( FlashinferMoeBackend, convert_moe_weights_to_flashinfer_trtllm_block_layout, @@ -327,7 +324,6 @@ def make_unquantized_moe_kernel( backend: UnquantizedMoeBackend, experts_cls: type[mk.FusedMoEExperts], routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - shared_experts: SharedExperts | None = None, ) -> mk.FusedMoEKernel: # Create Prepare/Finalize is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic) @@ -361,7 +357,6 @@ def make_unquantized_moe_kernel( kernel = mk.FusedMoEKernel( prepare_finalize, experts, - shared_experts=shared_experts, inplace=(not moe_config.disable_inplace and not is_monolithic), ) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index bf8641b060f2..28f55b7419bc 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -472,6 +472,7 @@ def _apply_quant_method( x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, + shared_experts=self._shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 227014e23973..e6492cef61c6 100644 --- 
a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -96,7 +96,7 @@ def _determine_shared_experts_order( if self._disable_shared_experts_overlap: return SharedExpertsOrder.NO_OVERLAP - if self._quant_method.mk_owns_shared_expert: + if self._quant_method.mk_can_overlap_shared_experts: return SharedExpertsOrder.MK_INTERNAL_OVERLAPPED should_run_shared_in_aux_stream = ( diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 89697033403d..c3e78bc4e768 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from typing import TYPE_CHECKING import torch import torch.nn.functional as F @@ -29,10 +30,16 @@ make_unquantized_moe_kernel, select_unquantized_moe_backend, ) +from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( + SharedExperts, +) from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe import RoutedExperts + logger = init_logger(__name__) @@ -173,7 +180,6 @@ def _setup_kernel( backend=self.unquantized_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: @@ -252,10 +258,11 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def apply( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: 
torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: return self.forward( @@ -263,15 +270,17 @@ def apply( x=x, topk_weights=topk_weights, topk_ids=topk_ids, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) def forward_native( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.moe_kernel is not None @@ -285,15 +294,17 @@ def forward_native( apply_router_weight_on_input=layer.apply_router_weight_on_input, global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) def forward_cuda( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: return self.forward_native( @@ -301,12 +312,13 @@ def forward_cuda( x, topk_weights, topk_ids, + shared_experts=shared_experts, shared_experts_input, ) def apply_monolithic( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 37cffcb3da26..28c02e54d536 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -10,7 +10,7 @@ from vllm import _custom_ops as ops from vllm import envs from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from 
vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -106,7 +106,7 @@ def get_quant_method( ): return UnquantizedLinearMethod() return AWQLinearMethod(self) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): # Lazy import to avoid circular import. from .awq_marlin import AWQMarlinConfig from .moe_wna16 import MoeWNA16Config diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index cfad1f86faa2..2f323e372e82 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -16,17 +16,16 @@ MPLinearLayerConfig, choose_mp_linear_kernel, ) -from vllm.model_executor.layers.fused_moe.config import ( +from vllm.model_executor.layers.fused_moe import ( FusedMoEConfig, - FusedMoEQuantConfig, -) -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, + FusedMoEQuantConfig, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, UnquantizedFusedMoEMethod, ) +from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -286,7 +285,7 @@ def get_quant_method( quant_method = AWQMarlinLinearMethod(self) quant_method.input_dtype = get_marlin_input_dtype(prefix) return quant_method - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config if is_layer_skipped( @@ -507,7 +506,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -606,7 +605,7 @@ def create_weights( device = 
layer.w13_qweight.device layer.workspace = marlin_make_workspace_new(device, 4) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: num_experts = layer.w13_qweight.shape[0] device = layer.w13_qweight.device is_a_8bit = self.input_dtype is not None and self.input_dtype.itemsize == 1 @@ -723,7 +722,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_bias.data = marlin_permute_bias(layer.w2_bias) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: from vllm.model_executor.layers.fused_moe.config import ( awq_marlin_moe_quant_config, @@ -747,7 +746,7 @@ def get_fused_moe_quant_config( def select_gemm_impl( self, prepare_finalize, - layer: torch.nn.Module, + layer: RoutedExperts, ): """ Select the GEMM implementation for AWQ-Marlin MoE. @@ -812,10 +811,11 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: return fused_marlin_moe( diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 729924663646..a32ed3196c1a 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -6,13 +6,12 @@ import torch from packaging import version -from vllm.model_executor.layers.fused_moe.config import ( +from vllm.model_executor.layers.fused_moe import ( FusedMoEConfig, - FusedMoEQuantConfig, -) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, + FusedMoEQuantConfig, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.linear import ( LinearBase, @@ -164,7 +163,7 @@ def 
get_quant_method( if is_layer_skipped_bnb(prefix, self.llm_int8_skip_modules): return UnquantizedLinearMethod() return BitsAndBytesLinearMethod(self) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): return BitsAndBytesMoEMethod(self, layer.moe_config) return None @@ -451,7 +450,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -472,16 +471,17 @@ def create_weights( ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return None def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 8d16a143b10a..92ef38462905 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -24,7 +24,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -192,25 +192,25 @@ def get_quant_method( if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): return CompressedTensorsMoEMethod.get_moe_method( self, layer, layer_name=prefix ) return None - def _add_fused_moe_to_target_scheme_map(self): + 
def _add_fused_moe_to_target_scheme_map(self): # XXXXXXXXXXXXXXXXXXXXXX """ Helper function to update target_scheme_map since linear layers get fused into FusedMoE targeting 'Linear' needs to also match - FusedMoE modules. + RoutedExperts modules. """ if ( "Linear" not in self.target_scheme_map - or "FusedMoE" in self.target_scheme_map + or "RoutedExperts" in self.target_scheme_map ): return - self.target_scheme_map["FusedMoE"] = self.target_scheme_map["Linear"] + self.target_scheme_map["RoutedExperts"] = self.target_scheme_map["Linear"] @classmethod def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig": diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe.py index f25b8af1d6b3..a2e941621927 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe.py @@ -32,7 +32,7 @@ def get_moe_method( layer: torch.nn.Module, layer_name: str, ) -> FusedMoEMethodBase: - # FusedMoE was made by combining multiple Linears so need to + # RoutedExperts was made by combining multiple Linears so need to # make sure quantization config for Linear can target it quant_config._add_fused_moe_to_target_scheme_map() unfused_names = [ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py index 629e1c5ef1be..fa1cf87c074b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py @@ -7,8 +7,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -135,7 +136,7 @@ def get_fused_moe_quant_config( w2_scale=layer.w2_weight_scale, ) - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: layer.w13_weight = torch.nn.Parameter( layer.w13_weight_packed.data, requires_grad=False ) @@ -193,16 +194,16 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, experts_cls=self.experts_cls, mxfp4_backend=self.mxfp4_backend, - shared_experts=layer.shared_experts, routing_tables=layer._maybe_init_expert_routing_tables(), ) def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.moe_kernel is not None @@ -216,5 +217,6 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py index 29c673d0f6e3..9448147aa6a9 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py @@ -7,8 +7,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -167,7 +168,7 @@ def create_weights( ) set_weight_attrs(w2_input_scale, extra_weight_attrs) - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: """ Convert NVFP4 MoE weights into kernel format and setup the kernel. """ @@ -235,7 +236,6 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, - shared_experts=layer.shared_experts, routing_tables=layer._maybe_init_expert_routing_tables(), ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) @@ -262,7 +262,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -286,10 +286,11 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.moe_kernel is not None @@ -303,5 +304,6 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py index b14571fe5013..f772807b6534 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py @@ -11,10 +11,11 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEActivationFormat, FusedMoEExpertsModular, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -136,7 +137,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-GROUP quantization for FusedMoE.weight_loader. + # Add PER-GROUP quantization for RoutedExperts.weight_loader. 
extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.GROUP.value} ) @@ -303,10 +304,11 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: if layer.enable_eplb: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_int8.py index 88cdbadd3f83..400dffeb6dee 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_int8.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( @@ -302,7 +302,7 @@ def is_monolithic(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py index bba7e0e7abce..3b9224072f75 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py @@ -12,8 +12,9 @@ from vllm.distributed import 
get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -191,7 +192,7 @@ def create_weights( torch.ones(num_experts, dtype=torch.float32), requires_grad=False ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-TENSOR quantization for FusedMoE.weight_loader. + # Add PER-TENSOR quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} ) @@ -214,7 +215,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + # Add PER-CHANNEL quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} ) @@ -243,7 +244,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + # Add PER-CHANNEL quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value} ) @@ -267,7 +268,7 @@ def create_weights( layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: # Allow for accessing weights and scales in standard way. 
w13 = layer.w13_weight w2 = layer.w2_weight @@ -337,7 +338,6 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def maybe_make_prepare_finalize( @@ -364,7 +364,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -387,10 +387,11 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -407,6 +408,7 @@ def apply( # https://github.com/vllm-project/vllm/commit/84166fee9770e6fba71a96978b3e7d149392fb28 # noqa: E501 expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py index bad5b3895b8f..cafc2d80f997 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py @@ -11,8 +11,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, ) from 
vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -128,7 +129,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + # Add PER-CHANNEL quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} ) @@ -140,7 +141,7 @@ def create_weights( layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: self.moe_quant_config = self.get_fused_moe_quant_config(layer) assert self.experts_cls is not None self.moe_kernel = make_int8_moe_kernel( @@ -148,7 +149,6 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def maybe_make_prepare_finalize( @@ -171,10 +171,11 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -189,5 +190,6 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py index ecd0b54890d1..94def152f500 100644 --- 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py @@ -5,8 +5,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -111,7 +112,7 @@ def create_weights( layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: layer.weight_block_size = self.weight_block_size w13, w2, w13_scale, w2_scale = convert_to_fp8_moe_kernel_format( @@ -139,7 +140,6 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def get_fused_moe_quant_config( @@ -165,7 +165,7 @@ def maybe_make_prepare_finalize( def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -188,10 +188,11 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -206,5 +207,6 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16.py index f530a1a1df2b..f6418194fd70 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16.py @@ -10,7 +10,8 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -240,10 +241,11 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py index 8f86e687b7f6..deac6ed4b892 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_wna16_marlin.py @@ -13,7 +13,8 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -514,7 +515,7 @@ def is_monolithic(self) 
-> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -540,10 +541,11 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert self.kernel_backend == "Marlin" diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 007c3c214eed..8c8ec8d1dd59 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -5,7 +5,9 @@ import torch -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + RoutedExperts, +) from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( @@ -53,6 +55,6 @@ def get_quant_method( ) -> "QuantizeMethodBase | None": if isinstance(layer, LinearBase): return UnquantizedLinearMethod() - elif isinstance(layer, FusedMoE): - return Int8OnlineMoEMethod(layer=layer) + elif isinstance(layer, RoutedExperts): + return Int8OnlineMoEMethod(layer=layer.moe_config) return None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 1c9237d3f60a..8d10c7c5d6b3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any import torch -from torch.nn import Module from torch.utils._python_dispatch import TorchDispatchMode import vllm.envs as envs @@ -19,14 +18,16 @@ from vllm.model_executor.kernels.linear.scaled_mm import 
MarlinFP8ScaledMMLinearKernel from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import ( - FusedMoE, + FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, + UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( convert_to_fp8_moe_kernel_format, make_fp8_moe_kernel, @@ -187,7 +188,7 @@ def get_quant_method( offline_method = Fp8LinearMethod(self) offline_method.marlin_input_dtype = get_marlin_input_dtype(prefix) return offline_method - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): if is_layer_skipped( prefix=prefix, ignored_layers=self.ignored_layers, @@ -195,9 +196,9 @@ def get_quant_method( ): return UnquantizedFusedMoEMethod(layer.moe_config) if self.is_checkpoint_fp8_serialized: - moe_quant_method = Fp8MoEMethod(self, layer) + moe_quant_method = Fp8MoEMethod(self, layer.moe_config) else: - moe_quant_method = Fp8OnlineMoEMethod(self, layer) + moe_quant_method = Fp8OnlineMoEMethod(self, layer.moe_config) return moe_quant_method elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) @@ -311,7 +312,7 @@ def __init__(self, quant_config: Fp8Config): def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, input_size_per_partition: int, output_partition_sizes: list[int], input_size: int, @@ -385,7 +386,7 @@ def create_weights( self.use_marlin = isinstance(self.fp8_linear, MarlinFP8ScaledMMLinearKernel) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: if self.use_marlin: # Only Marlin kernels support `marlin_input_dtype`; guard to avoid # AttributeError if backend selection changes. 
@@ -527,7 +528,7 @@ def create_weights( ) self.use_marlin = isinstance(self.fp8_linear, MarlinFP8ScaledMMLinearKernel) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return @@ -568,8 +569,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): quant_config: The quantization config. """ - def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): - super().__init__(layer.moe_config) + def __init__(self, quant_config: Fp8Config, moe_config: FusedMoEConfig): + super().__init__(moe_config) self.quant_config = quant_config self.weight_block_size = self.quant_config.weight_block_size self.block_quant: bool = self.weight_block_size is not None @@ -599,7 +600,7 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): def create_weights( self, - layer: Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -739,7 +740,7 @@ def create_weights( def _setup_kernel( self, - layer: FusedMoE, + layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -775,10 +776,9 @@ def _setup_kernel( fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: # Allow for accessing weights and scales in standard way. w13 = layer.w13_weight w2 = layer.w2_weight @@ -832,7 +832,7 @@ def maybe_make_prepare_finalize( "logic. This function should not be called." 
) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: w1_scale = getattr(layer, f"w13_{self.weight_scale_name}") w2_scale = getattr(layer, f"w2_{self.weight_scale_name}") a1_scale = layer.w13_input_scale @@ -865,7 +865,7 @@ def supports_eplb(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -889,10 +889,11 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -907,6 +908,7 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -925,15 +927,15 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): uses_meta_device: bool = True - def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): - super().__init__(quant_config, layer) + def __init__(self, quant_config: Fp8Config, moe_config: FusedMoEConfig): + super().__init__(quant_config, moe_config) assert not quant_config.is_checkpoint_fp8_serialized assert quant_config.activation_scheme == "dynamic" assert quant_config.weight_block_size is None def create_weights( self, - layer: Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -999,7 +1001,7 @@ def create_weights( initialize_online_processing(layer) - def process_weights_after_loading(self, layer: Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: # TODO(@ksayers): inplace fp8 quant kernel, initialize scales with ones if 
getattr(layer, "_already_called_process_weights_after_loading", False): return diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 61eb6c912a11..dca49d7ed976 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -15,17 +15,14 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.activation import ( - MoEActivation, - apply_moe_activation, -) -from vllm.model_executor.layers.fused_moe.config import ( +from vllm.model_executor.layers.fused_moe import ( FusedMoEConfig, - FusedMoEQuantConfig, -) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, + FusedMoEQuantConfig, + MoEActivation, + RoutedExperts, + SharedExperts, + apply_moe_activation, ) from vllm.model_executor.layers.linear import ( LinearBase, @@ -107,7 +104,7 @@ def get_quant_method( ): return UnquantizedEmbeddingMethod() return GGUFEmbeddingMethod(self) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): # TODO: Select UnquantizedFusedMoEMethod on unquantized layers. 
return GGUFMoEMethod(self, layer.moe_config) return None @@ -578,7 +575,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -639,16 +636,17 @@ def create_weights( layer.register_parameter("w2_qweight_type", w2_qweight_type) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return None def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: if layer.apply_router_weight_on_input: diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 458741478538..af91f7f9a16b 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -13,7 +13,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, @@ -172,7 +172,7 @@ def from_config(cls, config: dict[str, Any]) -> "GPTQConfig": def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Union["GPTQLinearMethod", "QuantizeMethodBase"] | None: - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility from .moe_wna16 import MoeWNA16Config diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 7b6f1f9cf6cd..c11dffe6a76d 100644 --- 
a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -14,14 +14,13 @@ MPLinearLayerConfig, choose_mp_linear_kernel, ) -from vllm.model_executor.layers.fused_moe.config import ( +from vllm.model_executor.layers.fused_moe import ( FusedMoEConfig, - FusedMoEQuantConfig, -) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, FusedMoEMethodBase, + FusedMoEQuantConfig, FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.oracle.int_wna16 import ( @@ -71,29 +70,28 @@ def get_moe_quant_method( config: "GPTQMarlinConfig", - layer: torch.nn.Module, + layer: RoutedExperts, prefix: str, moe_method_cls: type, ): cloned_config = deepcopy(config) - if isinstance(layer, FusedMoE): - # False = skip module, None = no override, else = Positive match - if ( - get_dynamic_override( # noqa: E712 - cloned_config, # noqa: E712 - layer_name=prefix, - ) - == False - ): # noqa: E712 - return UnquantizedFusedMoEMethod(layer.moe_config) + assert isinstance(layer, RoutedExperts) + # False = skip module, None = no override, else = Positive match + if ( + get_dynamic_override( # noqa: E712 + cloned_config, # noqa: E712 + layer_name=prefix, + ) + == False + ): # noqa: E712 + return UnquantizedFusedMoEMethod(layer.moe_config) - if prefix: - # Dynamic per module/layer rules may override base config - override_config(cloned_config, prefix=prefix) + if prefix: + # Dynamic per module/layer rules may override base config + override_config(cloned_config, prefix=prefix) - return moe_method_cls(cloned_config, layer.moe_config) - return None + return moe_method_cls(cloned_config, layer.moe_config) class GPTQMarlinConfig(QuantizationConfig): @@ -247,7 +245,7 @@ def override_quantization_method( def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> "QuantizeMethodBase | None": - if isinstance(layer, FusedMoE): + if 
isinstance(layer, RoutedExperts): from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config if not check_moe_marlin_supports_layer(layer, self.group_size): @@ -522,7 +520,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -676,7 +674,7 @@ def create_weights( device = layer.w13_qweight.device layer.workspace = marlin_make_workspace_new(device, 4) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: is_a_8bit = self.input_dtype is not None and self.input_dtype.itemsize == 1 if is_a_8bit: @@ -755,7 +753,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: self._setup_kernel(layer) - def _setup_kernel(self, layer: FusedMoE) -> None: + def _setup_kernel(self, layer: RoutedExperts) -> None: """Build the FusedMoEKernel for this layer.""" self.moe_quant_config = self.get_fused_moe_quant_config(layer) @@ -770,10 +768,9 @@ def _setup_kernel(self, layer: FusedMoE) -> None: w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices, w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: from vllm.model_executor.layers.fused_moe.config import ( gptq_marlin_moe_quant_config, ) @@ -796,7 +793,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def select_gemm_impl( self, prepare_finalize, - layer: torch.nn.Module, + layer: RoutedExperts, ): raise ValueError( f"{self.__class__.__name__} uses the new modular kernel " @@ -805,10 +802,11 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, 
topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py index 4457555c0764..c813a126b455 100644 --- a/vllm/model_executor/layers/quantization/inc.py +++ b/vllm/model_executor/layers/quantization/inc.py @@ -9,6 +9,7 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -234,7 +235,6 @@ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): self.extra_config = hf_to_vllm_mapper.apply_dict(self.extra_config) def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): - from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, check_moe_marlin_supports_layer, @@ -264,7 +264,7 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): AWQ_TYPE_MAP[weight_bits], group_size, not sym ) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): use_marlin = use_marlin and check_moe_marlin_supports_layer( layer, group_size ) @@ -298,7 +298,7 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): zero_point=not sym, ) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): if use_marlin: return AWQMarlinMoEMethod(quant_args_marlin, layer.moe_config) from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config @@ -320,7 +320,6 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"): return None def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"): - from vllm.model_executor.layers.fused_moe import FusedMoE 
from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, check_moe_marlin_supports_layer, @@ -349,7 +348,7 @@ def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"): use_marlin = (weight_bits, sym) in GPTQ_TYPE_MAP and check_marlin_supported( GPTQ_TYPE_MAP[(weight_bits, sym)], group_size, has_zp=not sym ) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): use_marlin = use_marlin and check_moe_marlin_supports_layer( layer, group_size ) @@ -385,7 +384,7 @@ def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"): dynamic={}, ) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): if use_marlin: return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config) else: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 242cc105e470..84a47c2e5cec 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -16,18 +16,15 @@ init_nvfp4_linear_kernel, ) from vllm.model_executor.layers.attention import Attention, MLAAttention -from vllm.model_executor.layers.fused_moe.activation import MoEActivation -from vllm.model_executor.layers.fused_moe.config import ( +from vllm.model_executor.layers.fused_moe import ( FusedMoEConfig, - FusedMoEQuantConfig, - RoutingMethodType, -) -from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, -) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, + FusedMoEQuantConfig, FusedMoeWeightScaleSupported, + MoEActivation, + RoutedExperts, + RoutingMethodType, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( Fp8MoeBackend, @@ -203,7 +200,7 @@ def get_quant_method( if getattr(quant_method, "backend", "") == "marlin": quant_method.marlin_input_dtype = get_marlin_input_dtype(prefix) return quant_method - elif 
isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): quant_method = self.FusedMoEMethodCls( quant_config=self, moe_config=layer.moe_config ) @@ -774,7 +771,7 @@ def maybe_make_prepare_finalize( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: raise ValueError( f"{self.__class__.__name__} uses the new modular kernel initialization " @@ -783,7 +780,7 @@ def select_gemm_impl( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -862,7 +859,7 @@ def create_weights( def _setup_kernel( self, - layer: FusedMoE, + layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -897,10 +894,9 @@ def _setup_kernel( fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: w13 = layer.w13_weight w2 = layer.w2_weight w13_scale = layer.w13_weight_scale @@ -931,7 +927,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer, w13, w2, w13_scale, w2_scale, w13_input_scale, w2_input_scale ) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: w1_scale = layer.w13_weight_scale w2_scale = layer.w2_weight_scale a1_scale = layer.w13_input_scale @@ -947,7 +943,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -971,10 +967,11 @@ def apply_monolithic( def 
apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -989,6 +986,7 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -1250,7 +1248,7 @@ def uses_weight_scale_2_pattern(self) -> bool: def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -1364,7 +1362,7 @@ def create_weights( ) layer.register_parameter("w2_input_scale", w2_input_scale) - def process_weights_after_loading(self, layer: FusedMoE) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: """ Convert NVFP4 MoE weights into kernel format and setup the kernel. 
""" @@ -1418,12 +1416,11 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, - shared_experts=layer.shared_experts, routing_tables=layer._maybe_init_expert_routing_tables(), ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) - def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: + def get_fused_moe_quant_config(self, layer: RoutedExperts) -> FusedMoEQuantConfig: return make_nvfp4_moe_quant_config( backend=self.nvfp4_backend, w13_scale=layer.w13_weight_scale, @@ -1440,7 +1437,7 @@ def supports_eplb(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -1464,10 +1461,11 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -1482,6 +1480,7 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) @@ -1689,15 +1688,15 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs, ): - layer.intermediate_size_per_partition = intermediate_size_per_partition - layer.hidden_size = hidden_size + assert layer.intermediate_size_per_partition == intermediate_size_per_partition + assert layer.hidden_size == hidden_size layer.orig_dtype = params_dtype if hidden_size % MXFP8_BLOCK_SIZE != 0: @@ -1880,7 +1879,7 @@ def _shuffle_weights_for_trtllm(self, layer: 
torch.nn.Module) -> None: torch.stack(w2_scale_shuffled).contiguous(), ) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return @@ -1900,7 +1899,7 @@ def maybe_make_prepare_finalize( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: raise ValueError( f"{self.__class__.__name__} uses the new modular kernel initialization " @@ -1908,7 +1907,7 @@ def select_gemm_impl( ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: # TRTLLM MXFP8 path is monolithic and does not use modular kernel config. return None @@ -1919,11 +1918,11 @@ def is_monolithic(self) -> bool: def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from flashinfer.fused_moe.core import ( ActivationType, Fp8QuantizationType, @@ -2003,12 +2002,13 @@ def apply_monolithic( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert not self.is_monolithic raise NotImplementedError( "Non-monolithic MXFP8 MoE path is not yet implemented." @@ -2124,7 +2124,7 @@ def _resolve_quant_algo(self, prefix: str) -> str | None: Tries three strategies in order: 1. Direct lookup in ``quantized_layers``. 2. Packed/fused-layer lookup (unfuse via ``packed_modules_mapping``). - 3. Prefix-based lookup for FusedMoE (any child key starts with + 3. 
Prefix-based lookup for RoutedExperts (any child key starts with ``prefix + "."``). Returns the upper-cased quant_algo string, or *None* if the prefix @@ -2151,7 +2151,7 @@ def _resolve_quant_algo(self, prefix: str) -> str | None: f"{algos}. All shards must use the same quantization." ) - # 3. Prefix-based lookup (for FusedMoE / parent modules) + # 3. Prefix-based lookup (for RoutedExperts / parent modules) prefix_dot = prefix + "." for key, info in self.quantized_layers.items(): if key.startswith(prefix_dot): @@ -2185,7 +2185,7 @@ def get_quant_method( # Layer not in quantized_layers — leave unquantized return UnquantizedLinearMethod() - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): if quant_algo == "FP8": return ModelOptFp8MoEMethod( quant_config=self.fp8_config, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index e5ef3f4c3168..436502e39447 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -6,18 +6,19 @@ import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group +from vllm.model_executor.layers.fused_moe import ( + FusedMoEConfig, + FusedMoEMethodBase, + FusedMoeWeightScaleSupported, + RoutedExperts, + SharedExperts, +) from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, int4_w4a16_moe_quant_config, int8_w8a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, - FusedMoEConfig, - FusedMoEMethodBase, - FusedMoeWeightScaleSupported, -) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) @@ -166,7 +167,7 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> "QuantizeMethodBase | None": if is_layer_skipped_quant(prefix, self.modules_to_not_convert): - if 
isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): return UnquantizedFusedMoEMethod(layer.moe_config) return UnquantizedLinearMethod() elif isinstance(layer, LinearBase): @@ -202,7 +203,7 @@ def get_quant_method( ) else: raise ValueError("moe_wna16 only support gptq and awq.") - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): return MoeWNA16Method(self, layer.moe_config) return None @@ -224,7 +225,7 @@ def __init__(self, quant_config: MoeWNA16Config, moe: "FusedMoEConfig") -> None: def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -343,7 +344,7 @@ def create_weights( set_weight_attrs(param, extra_weight_attrs) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: weight_bits = self.quant_config.weight_bits has_zp = self.quant_config.has_zp @@ -364,10 +365,11 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 0a516831c4ec..95bd9d3ddca7 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -7,15 +7,14 @@ from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEConfig, FusedMoEMethodBase, -) -from vllm.model_executor.layers.fused_moe import modular_kernel as mk -from vllm.model_executor.layers.fused_moe.config import ( FusedMoEParallelConfig, FusedMoEQuantConfig, + RoutedExperts, + 
SharedExperts, ) +from vllm.model_executor.layers.fused_moe import modular_kernel as mk from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import ( TRITON_BACKENDS, Mxfp4MoeBackend, @@ -87,7 +86,7 @@ def get_quant_method( "UnquantizedLinearMethod.", ) return UnquantizedLinearMethod() - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): return GptOssMxfp4MoEMethod(layer.moe_config) elif isinstance(layer, Attention): logger.debug_once( @@ -178,7 +177,7 @@ def maybe_roundup_sizes( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -272,7 +271,7 @@ def create_weights( def _setup_kernel( self, - layer: FusedMoE, + layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -365,10 +364,9 @@ def _setup_kernel( mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) - def process_weights_after_loading(self, layer): + def process_weights_after_loading(self, layer: RoutedExperts) -> None: w13 = layer.w13_weight w2 = layer.w2_weight w13_scale = layer.w13_weight_scale @@ -382,7 +380,7 @@ def process_weights_after_loading(self, layer): self._setup_kernel(layer, w13, w2, w13_scale, w2_scale, w13_bias, w2_bias) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: w1_scale = layer.w13_weight_scale w2_scale = layer.w2_weight_scale @@ -409,7 +407,7 @@ def get_fused_moe_quant_config( def select_gemm_impl( self, prepare_finalize: mk.FusedMoEPrepareAndFinalize, - layer: torch.nn.Module, + layer: RoutedExperts, ) -> mk.FusedMoEExpertsModular: raise ValueError( f"{self.__class__.__name__} uses the new modular kernel " @@ -418,10 +416,11 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: 
torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -436,12 +435,13 @@ def apply( global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=layer.expert_map, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -743,10 +743,11 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: assert not self.is_monolithic @@ -761,12 +762,13 @@ def apply( global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=layer.expert_map, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, diff --git a/vllm/model_executor/layers/quantization/online/base.py b/vllm/model_executor/layers/quantization/online/base.py index 315dcfacffcd..21230177aae2 100644 --- a/vllm/model_executor/layers/quantization/online/base.py +++ b/vllm/model_executor/layers/quantization/online/base.py @@ -11,7 +11,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, @@ -118,7 +118,7 @@ def get_quant_method( return Mxfp8OnlineLinearMethod() else: return Fp8PerTensorOnlineLinearMethod() - elif isinstance(layer, FusedMoE): + elif 
isinstance(layer, RoutedExperts): if should_ignore_layer( prefix, ignore=self.ignored_layers, diff --git a/vllm/model_executor/layers/quantization/online/fp8.py b/vllm/model_executor/layers/quantization/online/fp8.py index 9cb697289d7e..dd882e209884 100644 --- a/vllm/model_executor/layers/quantization/online/fp8.py +++ b/vllm/model_executor/layers/quantization/online/fp8.py @@ -8,7 +8,6 @@ if TYPE_CHECKING: import vllm.model_executor.layers.fused_moe.modular_kernel as mk - from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, ) @@ -18,6 +17,7 @@ from vllm import _custom_ops as ops from vllm.config import get_current_vllm_config from vllm.model_executor.kernels.linear import init_fp8_linear_kernel +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( select_fp8_moe_backend, ) @@ -308,7 +308,7 @@ def __init__( def _setup_kernel( self, - layer: "FusedMoE", + layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -349,7 +349,6 @@ def _setup_kernel( fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def get_fused_moe_quant_config( diff --git a/vllm/model_executor/layers/quantization/online/int8.py b/vllm/model_executor/layers/quantization/online/int8.py index 4b4c87fbce96..f61ecfe6000e 100644 --- a/vllm/model_executor/layers/quantization/online/int8.py +++ b/vllm/model_executor/layers/quantization/online/int8.py @@ -7,11 +7,11 @@ from torch.nn import Module if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.fused_moe.oracle.int8 import ( make_int8_moe_kernel, 
make_int8_moe_quant_config, @@ -91,7 +91,7 @@ def _quantize_weights(self, layer: Module) -> None: replace_parameter(layer, "w13_scale", w13_scale) replace_parameter(layer, "w2_scale", w2_scale) - def _setup_kernel(self, layer: "FusedMoE") -> None: + def _setup_kernel(self, layer: RoutedExperts) -> None: self.moe_quant_config = self.get_fused_moe_quant_config(layer) assert self.moe_quant_config is not None assert self.experts_cls is not None @@ -100,7 +100,6 @@ def _setup_kernel(self, layer: "FusedMoE") -> None: moe_config=self.moe, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def get_fused_moe_quant_config( diff --git a/vllm/model_executor/layers/quantization/online/moe_base.py b/vllm/model_executor/layers/quantization/online/moe_base.py index 417ce1770f9e..32eb81601bea 100644 --- a/vllm/model_executor/layers/quantization/online/moe_base.py +++ b/vllm/model_executor/layers/quantization/online/moe_base.py @@ -6,8 +6,12 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase -from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe import ( + FusedMoEMethodBase, + FusedMoEQuantConfig, + RoutedExperts, + SharedExperts, +) from vllm.model_executor.model_loader.reload.layerwise import ( initialize_online_processing, ) @@ -127,11 +131,11 @@ def supports_eplb(self) -> bool: def apply_monolithic( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply_monolithic( @@ -151,12 +155,13 @@ def apply_monolithic( def apply( self, - layer: "FusedMoE", # type: 
ignore[name-defined] # noqa: F821 + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: assert not self.is_monolithic assert self.moe_kernel is not None return self.moe_kernel.apply( @@ -169,5 +174,6 @@ def apply( global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/online/mxfp8.py b/vllm/model_executor/layers/quantization/online/mxfp8.py index 39a32604442c..3959be4aa24e 100644 --- a/vllm/model_executor/layers/quantization/online/mxfp8.py +++ b/vllm/model_executor/layers/quantization/online/mxfp8.py @@ -10,9 +10,9 @@ if TYPE_CHECKING: import vllm.model_executor.layers.fused_moe.modular_kernel as mk - from vllm.model_executor.layers.fused_moe import FusedMoE - from vllm.model_executor.layers.fused_moe.config import ( + from vllm.model_executor.layers.fused_moe import ( FusedMoEQuantConfig, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.oracle.fp8 import Fp8MoeBackend @@ -161,7 +161,7 @@ def _quantize_mxfp8_moe_weight( def _setup_kernel( self, - layer: "FusedMoE", + layer: "RoutedExperts", w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -200,7 +200,6 @@ def _setup_kernel( fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def get_fused_moe_quant_config( diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 6aaf9a645880..2ac1af9d541a 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ 
b/vllm/model_executor/layers/quantization/quark/quark.py @@ -9,7 +9,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -160,7 +160,7 @@ def get_quant_method( if isinstance(layer, Attention): return QuarkKVCacheMethod(self) - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix) return None diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index c50d4396ee39..1bdc15936717 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -12,11 +12,12 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( - FusedMoE, FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, MoEActivation, + RoutedExperts, + SharedExperts, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEParallelConfig, @@ -74,7 +75,7 @@ def __init__(self, moe: FusedMoEConfig): @staticmethod def get_moe_method( quant_config: "QuarkConfig", # type: ignore # noqa E501 # noqa F821 - module: torch.nn.Module, + module: RoutedExperts, layer_name: str, ) -> "QuarkMoEMethod": layer_quant_config = quant_config._find_matched_config(layer_name, module) @@ -178,7 +179,7 @@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -234,7 +235,7 @@ def create_weights( torch.ones(num_experts, dtype=torch.float32), requires_grad=False ) layer.register_parameter("w2_weight_scale", 
w2_weight_scale) - # Add PER-TENSOR quantization for FusedMoE.weight_loader. + # Add PER-TENSOR quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} ) @@ -256,7 +257,7 @@ def create_weights( requires_grad=False, ) layer.register_parameter("w2_weight_scale", w2_weight_scale) - # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + # Add PER-CHANNEL quantization for RoutedExperts.weight_loader. extra_weight_attrs.update( {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value} ) @@ -301,7 +302,7 @@ def create_weights( else: layer.w13_bias, layer.w2_bias = None, None - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: # Fp8 moe kernels require a single activation scale. # We take the max of all the scales in case they differ. if self.static_input_scales: @@ -436,7 +437,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return fp8_w8a8_moe_quant_config( w1_scale=layer.w13_weight_scale, @@ -451,10 +452,11 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: if self.rocm_aiter_moe_enabled: @@ -765,12 +767,13 @@ def get_fused_moe_quant_config( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe import fused_experts return fused_experts( @@ -805,7 +808,7 
@@ def __init__( def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -872,7 +875,7 @@ def create_weights( set_weight_attrs(w13_weight_scale_2, extra_weight_attrs) set_weight_attrs(w2_weight_scale_2, extra_weight_attrs) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight.data, layer.w2_weight.data ) @@ -917,10 +920,11 @@ def get_fused_moe_quant_config(self, layer): def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( @@ -1096,7 +1100,7 @@ def get_packed_dim(self, dim: int, quant_dtype: str): def create_weights( self, - layer: torch.nn.Module, + layer: RoutedExperts, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -1201,7 +1205,7 @@ def create_weights( layer.w13_input_scale = None layer.w2_input_scale = None - def process_weights_after_loading(self, layer): + def process_weights_after_loading(self, layer: RoutedExperts) -> None: if self.static_input_scales and self.input_dtype == "fp8": # firstly, process activations if fp8 static input if layer.w13_input_scale is None or layer.w2_input_scale is None: @@ -1296,7 +1300,7 @@ def process_weights_after_loading(self, layer): self.moe_quant_config = self.get_fused_moe_quant_config(layer) torch.accelerator.empty_cache() - def _setup_kernel_via_oracle(self, layer: FusedMoE): + def _setup_kernel_via_oracle(self, layer: RoutedExperts): """Setup kernel using oracle functions for w_mxfp4 scheme.""" w13 = layer.w13_weight w2 = layer.w2_weight @@ -1345,11 +1349,10 @@ def 
_setup_kernel_via_oracle(self, layer: FusedMoE): mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: # For w_mxfp4 with oracle backend, use oracle function if self.ocp_mx_scheme == "w_mxfp4" and self.mxfp4_backend not in ( @@ -1413,10 +1416,11 @@ def is_monolithic(self) -> bool: def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: # For oracle kernel or emulation kernel @@ -1454,7 +1458,7 @@ def apply( def apply_monolithic( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, @@ -1482,7 +1486,7 @@ def __init__( ): super().__init__(weight_config, input_config, moe) - def process_weights_after_loading(self, layer): + def process_weights_after_loading(self, layer: RoutedExperts) -> None: from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig w13_bias = layer.w13_bias.to(torch.float32) @@ -1553,7 +1557,7 @@ def process_weights_after_loading(self, layer): ) def get_fused_moe_quant_config( - self, layer: torch.nn.Module + self, layer: RoutedExperts ) -> FusedMoEQuantConfig | None: return mxfp4_w4a8_moe_quant_config( w1_scale=self.w13_precision_config, @@ -1571,11 +1575,10 @@ def is_monolithic(self) -> bool: def apply_monolithic( self, - layer: torch.nn.Module, + layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, - expert_map: torch.Tensor | None = None, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet." 
@@ -1595,7 +1598,7 @@ def apply_monolithic( topk=layer.top_k, renormalize=layer.renormalize, global_num_experts=layer.global_num_experts, - expert_map=expert_map, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, apply_router_weight_on_input=layer.apply_router_weight_on_input, unpadded_N_w1=self.moe.intermediate_size_per_partition_unpadded * 2, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index ef0bf2bf7aca..5bd08de9a610 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -21,7 +21,7 @@ ) if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.layer import FusedMoE + from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import ( NvFp4MoeBackend, ) @@ -80,7 +80,7 @@ def interleave_linear_and_gate( def prepare_nvfp4_moe_layer_for_flashinfer_cutedsl( - layer: "FusedMoE", + layer: "RoutedExperts", w13: torch.Tensor, w13_scale: torch.Tensor, w13_scale_2: torch.Tensor, @@ -286,7 +286,7 @@ def prepare_static_weights_for_trtllm_fp4_moe( def prepare_nvfp4_moe_layer_for_fi_or_cutlass( backend: "NvFp4MoeBackend", - layer: "FusedMoE", + layer: "RoutedExperts", w13: torch.Tensor, w13_scale: torch.Tensor, w13_scale_2: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index d659effd70ff..39e1083a81dd 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -8,6 +8,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import LinearBase from 
vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.int8_utils import ( @@ -226,7 +227,7 @@ def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool: )[0] -def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool: +def check_moe_marlin_supports_layer(layer: RoutedExperts, group_size: int) -> bool: if current_platform.is_rocm(): return False hidden_size = layer.hidden_size @@ -471,7 +472,7 @@ def get__quant_fp8_method() -> QuantFP8: return _quant_fp8_method -def get_marlin_input_dtype(prefix: str | None = None): +def get_marlin_input_dtype(prefix: str | None = None): # ? if envs.VLLM_MARLIN_INPUT_DTYPE is None: return elif envs.VLLM_MARLIN_INPUT_DTYPE.lower() == "int8": From e218fc785be7d2ad9b2d9c34320a81525cf03680 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 21:28:09 +0000 Subject: [PATCH 108/191] fix cruft Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/fused_moe_method_base.py | 2 +- .../quantization/compressed_tensors/compressed_tensors.py | 2 +- vllm/model_executor/layers/quantization/utils/marlin_utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 85189e873666..196ef4c202c9 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -166,7 +166,7 @@ def apply( x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - shared_experts: "SharedExperts" | None, + shared_experts: "SharedExperts | None", shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py 
index 92ef38462905..8194319feecb 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -198,7 +198,7 @@ def get_quant_method( ) return None - def _add_fused_moe_to_target_scheme_map(self): # XXXXXXXXXXXXXXXXXXXXXX + def _add_fused_moe_to_target_scheme_map(self): """ Helper function to update target_scheme_map since linear layers get fused into FusedMoE diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 39e1083a81dd..4600cb36918b 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -472,7 +472,7 @@ def get__quant_fp8_method() -> QuantFP8: return _quant_fp8_method -def get_marlin_input_dtype(prefix: str | None = None): # ? +def get_marlin_input_dtype(prefix: str | None = None): if envs.VLLM_MARLIN_INPUT_DTYPE is None: return elif envs.VLLM_MARLIN_INPUT_DTYPE.lower() == "int8": From f37de99363b0adca7ef72ddc518b8b4a77dac51d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 21:37:27 +0000 Subject: [PATCH 109/191] fix broken imports Signed-off-by: Bill Nell --- vllm/model_executor/layers/quantization/awq.py | 2 +- vllm/model_executor/layers/quantization/gptq.py | 2 +- vllm/model_executor/layers/quantization/inc.py | 2 +- vllm/model_executor/layers/quantization/quark/quark.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 28c02e54d536..edacfc76334b 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -10,7 +10,7 @@ from vllm import _custom_ops as ops from vllm import envs from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.routed_experts 
import RoutedExperts +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index af91f7f9a16b..db2db086815d 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -13,7 +13,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py index c813a126b455..1db74ad21376 100644 --- a/vllm/model_executor/layers/quantization/inc.py +++ b/vllm/model_executor/layers/quantization/inc.py @@ -9,7 +9,7 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 2ac1af9d541a..6a4e1bcb5a0c 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -9,7 +9,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, From 
71d27a020c277d9bc32d4033fadf4a3cf01cf44a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 23:06:56 +0000 Subject: [PATCH 110/191] fixes Signed-off-by: Bill Nell --- tests/kernels/moe/test_zero_expert_moe.py | 1 + vllm/model_executor/layers/fused_moe/__init__.py | 4 ++-- .../layers/fused_moe/fused_moe_method_base.py | 4 ++-- .../layers/fused_moe/fused_moe_modular_method.py | 2 +- .../quantization/compressed_tensors/compressed_tensors.py | 8 ++++---- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index d8f900256ec3..f10459aa5192 100644 --- a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -183,6 +183,7 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, + shared_experts=None, shared_experts_input=None, ) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index a7d0229048ce..67f0034076d9 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager -from typing import Any +from typing import Any, TypeAlias from vllm.model_executor.layers.fused_moe.activation import ( MoEActivation, @@ -44,7 +44,7 @@ # Temporary alias for FusedMoE, eventually we be its own class. 
-RoutedExperts = FusedMoE +RoutedExperts: TypeAlias = FusedMoE @contextmanager diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 196ef4c202c9..b68279d2cbe4 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -162,7 +162,7 @@ def is_monolithic(self) -> bool: def apply( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, @@ -173,7 +173,7 @@ def apply( def apply_monolithic( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 42446c4876a0..e8300b5f6af2 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -84,7 +84,7 @@ def get_fused_moe_quant_config( def apply( self, - layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 + layer: "RoutedExperts", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 8194319feecb..72a1dcd168cd 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -24,7 +24,7 @@ ) from vllm.logger import init_logger from 
vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import RoutedExperts +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -192,7 +192,7 @@ def get_quant_method( if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) - if isinstance(layer, RoutedExperts): + if isinstance(layer, FusedMoE): # RoutedExperts): return CompressedTensorsMoEMethod.get_moe_method( self, layer, layer_name=prefix ) @@ -207,10 +207,10 @@ def _add_fused_moe_to_target_scheme_map(self): """ if ( "Linear" not in self.target_scheme_map - or "RoutedExperts" in self.target_scheme_map + or "FusedMoE" in self.target_scheme_map ): return - self.target_scheme_map["RoutedExperts"] = self.target_scheme_map["Linear"] + self.target_scheme_map["FusedMoE"] = self.target_scheme_map["Linear"] @classmethod def from_config(cls, config: dict[str, Any]) -> "CompressedTensorsConfig": From 04858c28b4264c16db15aa25ea45a310da67687a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 25 Apr 2026 01:04:20 +0000 Subject: [PATCH 111/191] fix up new quant method Signed-off-by: Bill Nell --- .../compressed_tensors/compressed_tensors.py | 11 ++++++++-- .../schemes/compressed_tensors_wNa16.py | 2 ++ .../layers/quantization/humming.py | 22 ++++++++++--------- vllm/utils/__init__.py | 4 +++- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 72a1dcd168cd..8d6f7441e635 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -24,7 +24,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from 
vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import RoutedExperts from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -159,9 +159,12 @@ def get_quant_method( layer: torch.nn.Module, prefix: str, ) -> "QuantizeMethodBase | None": + print(f"GOT HERE {layer.__class__, isinstance(layer, LinearBase)}") + if isinstance(layer, LinearBase): # collect schemes quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) + print(f"GOT HERE QS {quant_scheme}") input_tfms, output_tfms = get_linear_transform_schemes( layer, prefix, self.transform_config, self.packed_modules_mapping ) @@ -184,7 +187,9 @@ def get_quant_method( if isinstance(layer, ParallelLMHead): try: quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) + print(f"FOUND QS {quant_scheme}") except ValueError: + print(f"FAILED FOUND QS {quant_scheme}") quant_scheme = None if quant_scheme is not None: layer.scheme = quant_scheme @@ -192,10 +197,11 @@ def get_quant_method( if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) - if isinstance(layer, FusedMoE): # RoutedExperts): + if isinstance(layer, RoutedExperts): return CompressedTensorsMoEMethod.get_moe_method( self, layer, layer_name=prefix ) + print("NEVER!!!!!!!!!!!!!!!!!!!!!") return None def _add_fused_moe_to_target_scheme_map(self): @@ -928,6 +934,7 @@ def create_weights( details """ weight_loader = extra_weight_attrs.get("weight_loader") + print(f"SCHEME = {layer.__class__, layer.scheme, layer.prefix}") layer.scheme.create_weights( layer=layer, input_size=input_size, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 1883d4ae322c..08eafdc48215 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -106,6 +106,8 @@ def create_weights( has_g_idx=self.has_g_idx, ) + print(f"LLC {mp_linear_kernel_config}") + kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) if kernel_type.__name__ not in self._kernel_backends_being_used: diff --git a/vllm/model_executor/layers/quantization/humming.py b/vllm/model_executor/layers/quantization/humming.py index 59f9c2ee9b97..77ce10128abc 100644 --- a/vllm/model_executor/layers/quantization/humming.py +++ b/vllm/model_executor/layers/quantization/humming.py @@ -9,16 +9,17 @@ import torch from vllm import envs +from vllm.model_executor.layers.fused_moe import ( + FusedMoEMethodBase, + RoutedExperts, + SharedExperts, +) from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, FusedMoEQuantDesc, ) -from vllm.model_executor.layers.fused_moe.layer import ( - FusedMoE, - FusedMoEMethodBase, -) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) @@ -185,7 +186,7 @@ def compressed_tensors_get_config(config: dict[str, Any], key: str): class HummingConfig(QuantizationConfig): - packed_modules_mapping = {} + packed_modules_mapping: dict[str, list[str]] = {} def __init__(self, full_config: dict[str, Any] | None = None): assert_humming_available() @@ -330,7 +331,7 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> "QuantizeMethodBase | None": layer_type = "other" - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): layer_type = "moe" elif isinstance(layer, LinearBase): layer_type = "linear" @@ -343,13 +344,13 @@ def get_quant_method( quant_config = self.get_quant_config_for_layer(prefix, layer_type) if quant_config is None: - if isinstance(layer, FusedMoE): + if isinstance(layer, RoutedExperts): return 
UnquantizedFusedMoEMethod(layer.moe_config) elif isinstance(layer, LinearBase): return UnquantizedLinearMethod() elif isinstance(layer, LinearBase): return HummingLinearMethod(quant_config) - elif isinstance(layer, FusedMoE): + elif isinstance(layer, RoutedExperts): return HummingMoEMethod(quant_config, layer.moe_config) return None @@ -810,7 +811,7 @@ def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantCon _w2=w2_quant_desc, ) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + def process_weights_after_loading(self, layer: RoutedExperts) -> None: if getattr(self, "processed", False): return self.processed = True @@ -936,10 +937,11 @@ def select_gemm_impl( def apply( self, - layer: FusedMoE, + layer: RoutedExperts, x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts: SharedExperts | None, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: workspace1, workspace2, output = self.experts.make_workspaces( diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index bf455c261f4f..7040fb52b052 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -46,4 +46,6 @@ def _check_bases(cls): if _check_bases(b): return True - return _check_bases(module.__class__) + res = _check_bases(module.__class__) + print(f"IS_MOE_LAYER[{module.__class__}] = {res}") + return res From 54460291d6d006016d4811037ce9fbbd6458c392 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 19:24:44 +0000 Subject: [PATCH 112/191] fix lint Signed-off-by: Bill Nell --- .../layers/fused_moe/unquantized_fused_moe_method.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index c3e78bc4e768..72cab5579f24 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ 
b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -312,7 +312,7 @@ def forward_cuda( x, topk_weights, topk_ids, - shared_experts=shared_experts, + shared_experts, shared_experts_input, ) From 89395fa7b2f3f3bca900819241063519c3ff032e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 19:32:33 +0000 Subject: [PATCH 113/191] fix lint Signed-off-by: Bill Nell --- vllm/model_executor/layers/quantization/mxfp4.py | 3 +-- vllm/model_executor/layers/quantization/quark/quark_moe.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 95bd9d3ddca7..a56cafe5391e 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -598,7 +598,7 @@ def create_weights( def _setup_kernel( self, - layer: FusedMoE, + layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, @@ -691,7 +691,6 @@ def _setup_kernel( mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - shared_experts=layer.shared_experts, ) def process_weights_after_loading(self, layer): diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 1bdc15936717..e61adf4712f1 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1578,6 +1578,7 @@ def apply_monolithic( layer: RoutedExperts, x: torch.Tensor, router_logits: torch.Tensor, + input_ids: torch.Tensor | None = None, ) -> torch.Tensor: if layer.enable_eplb: raise NotImplementedError( From ced1799dd77fad9fd2392c5dd303a707eac94aed Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 20:50:13 +0000 Subject: [PATCH 114/191] expert map manager Signed-off-by: Bill Nell --- 
.../layers/fused_moe/expert_map_manager.py | 514 ++++++++++++++++++ 1 file changed, 514 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/expert_map_manager.py diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py new file mode 100644 index 000000000000..5364f4163102 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -0,0 +1,514 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Expert Map Manager for MoE layers. + +This module contains the ExpertMapManager class which manages expert ID +mappings and placement strategies for Expert Parallelism in MoE models. +""" + +import torch + +from vllm.config.parallel import ExpertPlacementStrategy +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + init_aiter_topK_meta_data, +) + +logger = init_logger(__name__) + + +def determine_expert_map( + ep_size: int, + ep_rank: int, + global_num_experts: int, + expert_placement_strategy: ExpertPlacementStrategy = "linear", + num_fused_shared_experts: int = 0, + return_expert_mask: bool = False, +) -> tuple[int, torch.Tensor | None, torch.Tensor | None]: + """ + Calculates how many experts should be assigned to each rank for EP and + creates a mapping from global to local expert index. Experts are + distributed evenly across ranks. Any remaining are assigned to the + last rank. + + Args: + ep_size: The size of the expert parallel group + ep_rank: The rank of the current process in the expert parallel + group + global_num_experts: The total number of experts in the model. + expert_placement_strategy: The expert placement strategy. 
+ num_fused_shared_experts: Number of fused shared experts (for AITER) + return_expert_mask: Whether to return expert mask for AITER + + Returns: + tuple[int, Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple containing: + - local_num_experts (int): The number of experts assigned + to the current rank. + - expert_map (Optional[torch.Tensor]): A tensor of shape + (global_num_experts,) mapping from global to local index. + Contains -1 for experts not assigned to the current rank. + Returns None if ep_size is 1. + - expert_mask (Optional[torch.Tensor]): A tensor of shape + (global_num_experts + num_fused_shared_experts + 1,) + containing 1 for experts assigned to the current rank + and 0 for sentinel. + Returns None if ep_size is 1. + Used only when AITER MOE is enabled. + """ + from typing import get_args + + assert ep_size > 0 + if ep_size == 1: + return (global_num_experts, None, None) + + # Distribute experts as evenly as possible to each rank. + base_experts = global_num_experts // ep_size + remainder = global_num_experts % ep_size + local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts + + # Create a tensor of size num_experts filled with -1 + expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32) + # Create an expert map for the local experts + if expert_placement_strategy == "linear": + start_idx = ep_rank * base_experts + min(ep_rank, remainder) + expert_map[start_idx : start_idx + local_num_experts] = torch.arange( + 0, local_num_experts, dtype=torch.int32 + ) + elif expert_placement_strategy == "round_robin": + local_log_experts = torch.arange( + ep_rank, global_num_experts, ep_size, dtype=torch.int32 + ) + + expert_map[local_log_experts] = torch.arange( + 0, local_num_experts, dtype=torch.int32 + ) + else: + raise ValueError( + "Unsupported expert placement strategy " + f"'{expert_placement_strategy}', expected one of " + f"{get_args(ExpertPlacementStrategy)}" + ) + + expert_mask = None + if return_expert_mask: 
+ expert_mask = torch.ones( + (global_num_experts + num_fused_shared_experts + 1,), dtype=torch.int32 + ) + expert_mask[-1] = 0 + expert_mask[:global_num_experts] = expert_map > -1 + expert_map = torch.cat( + ( + expert_map, + torch.tensor( + [local_num_experts + i for i in range(num_fused_shared_experts)], + dtype=torch.int32, + ), + ), + dim=0, + ) + + return (local_num_experts, expert_map, expert_mask) + + +def determine_expert_placement_strategy( + expert_placement_strategy: ExpertPlacementStrategy, + moe_parallel_config: FusedMoEParallelConfig, + num_expert_group: int | None, + num_redundant_experts: int, + enable_eplb: bool, +) -> ExpertPlacementStrategy: + if expert_placement_strategy == "round_robin": + round_robin_supported = ( + (num_expert_group is not None and num_expert_group > 1) + and num_redundant_experts == 0 + and not enable_eplb + ) + + if not round_robin_supported: + logger.warning( + "Round-robin expert placement is only supported for " + "models with multiple expert groups and no redundant " + "experts. Falling back to linear expert placement." + ) + return "linear" + if ( + moe_parallel_config.use_all2all_kernels + and not moe_parallel_config.use_deepep_ll_kernels + and not moe_parallel_config.use_nixl_ep_kernels + ): + logger.warning( + "Round-robin expert placement currently only supports " + "the DeepEP low-latency or NIXL EP backend, but '%s' was configured. " + "Falling back to linear expert placement.", + moe_parallel_config.all2all_backend, + ) + return "linear" + + return expert_placement_strategy + + +class ExpertMapManager: + """ + Manages expert ID mappings and placement for Expert Parallelism. 
+ + Responsibilities: + - Calculate local vs global expert counts + - Map between global, local, and physical expert IDs + - Manage placement strategies (linear, round_robin) + - Maintain routing tables for round-robin placement + - Support dynamic reconfiguration of EP topology + """ + + def __init__( + self, + max_num_batched_tokens: int, + top_k: int, + global_num_experts: int, + logical_num_experts: int, + num_redundant_experts: int, + num_expert_group: int | None, + moe_parallel_config: FusedMoEParallelConfig, + placement_strategy: ExpertPlacementStrategy, + enable_eplb: bool, + num_fused_shared_experts: int = 0, + rocm_aiter_enabled: bool = False, + device: torch.device | None = None, + ): + """ + Initialize expert map manager. + + Args: + global_num_experts: Total number of experts across all ranks + logical_num_experts: Number of logical (non-redundant) experts + moe_parallel_config: MoE parallel configuration (contains ep_size, + ep_rank, backend flags) + placement_strategy: Strategy for placing experts ('linear' or 'round_robin') + num_fused_shared_experts: Number of fused shared experts (for AITER) + rocm_aiter_enabled: Whether ROCm AITER fusion is enabled + device: Device for tensor allocations + """ + self.global_num_experts = global_num_experts + self.logical_num_experts = logical_num_experts + self.moe_parallel_config = moe_parallel_config + self.num_fused_shared_experts = num_fused_shared_experts + self.rocm_aiter_enabled = rocm_aiter_enabled + self.device = device + + if moe_parallel_config.use_ep: + # Determine expert placement strategy before creating manager + # TODO move into EMM + placement_strategy = determine_expert_placement_strategy( + expert_placement_strategy=placement_strategy, + moe_parallel_config=moe_parallel_config, + num_expert_group=num_expert_group, + num_redundant_experts=num_redundant_experts, + enable_eplb=enable_eplb, + ) + + # Determine effective placement strategy + self._placement_strategy = 
self._determine_placement_strategy( + placement_strategy + ) + + # Calculate expert mappings + self._calculate_expert_maps() + + # Initialize routing tables if needed + self._maybe_init_routing_tables() + + self._init_aiter_shared_experts_topK_buffer( + dp_size=self.moe_parallel_config.dp_size, + top_k=top_k, + max_num_batched_tokens=max_num_batched_tokens, + ) + + if self.use_ep and self.rocm_aiter_enabled: + expert_mask = self.expert_mask + assert expert_mask is None or torch.all( + (expert_mask == 0) | (expert_mask == 1) + ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." + + # Log EP configuration (move into EMM?) + if self.use_ep: + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Expert " + "placement strategy: %s. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", + self.ep_rank, + self.ep_size, + self.placement_strategy, + self.local_num_experts, + self.global_num_experts, + self.get_compressed_map_string(), + ) + + def _init_aiter_shared_experts_topK_buffer( + self, + dp_size: int, + top_k: int, + max_num_batched_tokens: int, + ): + if self.num_fused_shared_experts > 0: + init_aiter_topK_meta_data( + n_routed_experts=self.global_num_experts, + n_shared_experts=self.num_fused_shared_experts, + top_k=top_k, + tp_rank=self.ep_rank if self.use_ep else self.tp_rank, + tp_size=self.ep_size if self.use_ep else self.tp_size, + shared_experts_score=1.0, + max_num_tokens=max_num_batched_tokens * dp_size, + is_EP=self.use_ep, + ) + self._local_num_experts += self.num_fused_shared_experts + + @property + def use_ep(self) -> int: + return self.moe_parallel_config.use_ep + + @property + def ep_size(self) -> int: + return self.moe_parallel_config.ep_size + + @property + def ep_rank(self) -> int: + return self.moe_parallel_config.ep_rank + + @property + def tp_size(self) -> int: + return self.moe_parallel_config.tp_size + + @property + def tp_rank(self) -> int: + return 
self.moe_parallel_config.tp_rank + + @property + def local_num_experts(self) -> int: + return self._local_num_experts + + @property + def expert_map(self) -> torch.Tensor | None: + """ + Mapping from global expert ID to local expert ID. + + Returns tensor of shape (global_num_experts,) where: + - expert_map[global_id] = local_id if expert is on this rank + - expert_map[global_id] = -1 if expert is not on this rank + + Returns None if EP is not enabled (ep_size == 1). + """ + return self._expert_map + + @property + def expert_mask(self) -> torch.Tensor | None: + """ + Expert mask for AITER fusion (ROCm-specific). + + Returns tensor of shape (global_num_experts + num_fused_shared + 1,) + where 1 indicates expert is on this rank, 0 otherwise. + """ + return self._expert_mask + + @property + def placement_strategy(self) -> ExpertPlacementStrategy: + """Expert placement strategy ('linear' or 'round_robin').""" + return self._placement_strategy + + @property + def routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + """ + Routing tables for round-robin placement. + + Returns (global_to_physical, physical_to_global, local_to_global) + or None if not using round-robin or tables not needed. + """ + if not hasattr(self, "_routing_tables"): + return None + return self._routing_tables + + def map_global_to_local(self, global_id: int) -> int: + """ + Map global expert ID to local expert ID. 
+ + Args: + global_id: Global expert ID (0 to global_num_experts - 1) + + Returns: + Local expert ID (0 to local_num_experts - 1) + + Note: + Returns -1 (does not raise) if expert is not on this rank + """ + if self._expert_map is None: + return global_id + + return self._expert_map[global_id].item() + + def is_local_expert(self, global_id: int) -> bool: + """Check if expert is assigned to this rank.""" + if self._expert_map is None: + return True + return self._expert_map[global_id] != -1 + + def get_local_expert_ids(self) -> list[int]: + """Get list of global IDs for experts on this rank.""" + if self._expert_map is None: + return list(range(self.global_num_experts)) + + return torch.where(self._expert_map != -1)[0].tolist() + + def update( + self, + new_ep_size: int | None = None, + new_ep_rank: int | None = None, + ) -> None: + """ + Update expert mappings for new EP configuration. + + Used during dynamic reconfiguration (e.g., elastic scaling). + + Args: + new_ep_size: New EP world size (if changed) + new_ep_rank: New EP rank (if changed) + """ + if new_ep_size is not None: + self.moe_parallel_config.ep_size = new_ep_size + if new_ep_rank is not None: + self.moe_parallel_config.ep_rank = new_ep_rank + + # Recalculate everything + self._placement_strategy = self._determine_placement_strategy( + self._placement_strategy + ) + self._calculate_expert_maps() + self._maybe_init_routing_tables() + + def get_compressed_map_string(self) -> str: + """ + Get compressed string representation of expert map for logging. + + Returns string mapping local to global expert IDs.
+ """ + if self._expert_map is None: + return f"[0..{self.global_num_experts - 1}]" + + global_indices = torch.where(self._expert_map != -1)[0] + local_indices = self._expert_map[global_indices] + return ", ".join( + f"{local_index.item()}->{global_index.item()}" + for local_index, global_index in zip(local_indices, global_indices) + ) + + # Private methods + + def _determine_placement_strategy( + self, requested_strategy: ExpertPlacementStrategy + ) -> ExpertPlacementStrategy: + """Determine effective placement strategy based on config.""" + if requested_strategy != "round_robin": + return requested_strategy + + # Round-robin requires specific conditions + if self.ep_size == 1: + return "linear" + + if ( + self.moe_parallel_config.use_all2all_kernels + and not self.moe_parallel_config.use_deepep_ll_kernels + and not self.moe_parallel_config.use_nixl_ep_kernels + ): + logger.warning( + "Round-robin placement requires DeepEP-ll or NIXL backend. " + "Falling back to linear." + ) + return "linear" + + return "round_robin" + + def _calculate_expert_maps(self) -> None: + """Calculate expert mappings based on placement strategy.""" + if self.ep_size == 1: + # No EP, all experts are local + self._local_num_experts = self.global_num_experts + self._expert_map = None + self._expert_mask = None + return + + # Call determine_expert_map with current config + ( + self._local_num_experts, + self._expert_map, + self._expert_mask, + ) = determine_expert_map( + ep_size=self.ep_size, + ep_rank=self.ep_rank, + global_num_experts=self.global_num_experts, + expert_placement_strategy=self._placement_strategy, + num_fused_shared_experts=self.num_fused_shared_experts, + return_expert_mask=self.rocm_aiter_enabled, + ) + + # Move to device if specified + if self.device is not None: + if self._expert_map is not None: + self._expert_map = self._expert_map.to(self.device) + if self._expert_mask is not None: + self._expert_mask = self._expert_mask.to(self.device) + + def 
_maybe_init_routing_tables(self) -> None: + """Initialize routing tables if needed for round-robin.""" + if self._placement_strategy != "round_robin": + return + + if ( + not self.moe_parallel_config.use_deepep_ll_kernels + and not self.moe_parallel_config.use_nixl_ep_kernels + ): + return + + if self._expert_map is None: + return + + self._routing_tables = self._ensure_round_robin_expert_routing_tables() + + def _ensure_round_robin_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Build routing tables for round-robin placement.""" + device_kwargs = {"device": self.device} if self.device is not None else {} + + global_indices = torch.arange( + self.global_num_experts, dtype=torch.long, **device_kwargs + ) + owner = torch.remainder(global_indices, self.ep_size) + local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") + + base = self.global_num_experts // self.ep_size + remainder = self.global_num_experts % self.ep_size + physical_offset = owner * base + + if remainder > 0: + remainder_tensor = torch.tensor( + remainder, dtype=torch.long, **device_kwargs + ) + physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) + + global_to_physical = physical_offset + local_index + physical_to_global = torch.empty_like(global_to_physical) + physical_to_global[global_to_physical] = global_indices + + local_global = torch.arange( + self.ep_rank, + self.global_num_experts, + self.ep_size, + dtype=torch.long, + **device_kwargs, + ) + if local_global.numel() != self._local_num_experts: + local_global = local_global[: self._local_num_experts] + + return (global_to_physical, physical_to_global, local_global) From ba52a86c21303614c0fc6fbd56ff7103c81431f7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 21:07:52 +0000 Subject: [PATCH 115/191] wip Signed-off-by: Bill Nell --- tests/distributed/test_expert_placement.py | 4 +- .../kernels/moe/test_moe_permute_unpermute.py | 4 +- 
.../layers/fused_moe/expert_map_manager.py | 17 + vllm/model_executor/layers/fused_moe/layer.py | 330 +++--------------- 4 files changed, 73 insertions(+), 282 deletions(-) diff --git a/tests/distributed/test_expert_placement.py b/tests/distributed/test_expert_placement.py index 8b3a64b9c134..46f63408f467 100644 --- a/tests/distributed/test_expert_placement.py +++ b/tests/distributed/test_expert_placement.py @@ -3,7 +3,9 @@ import pytest -from vllm.model_executor.layers.fused_moe.layer import determine_expert_map +from vllm.model_executor.layers.fused_moe.expert_map_manager import ( + determine_expert_map, +) def verify_round_robin_pattern(expert_map, ep_rank, ep_size, global_num_experts): diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 92126171a17b..5aafb89589fd 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -10,7 +10,9 @@ import torch from vllm.model_executor.layers.fused_moe import fused_topk -from vllm.model_executor.layers.fused_moe.layer import determine_expert_map +from vllm.model_executor.layers.fused_moe.expert_map_manager import ( + determine_expert_map, +) from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( moe_permute, moe_permute_unpermute_supported, diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 5364f4163102..e587a854ea23 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -365,6 +365,9 @@ def update( self, new_ep_size: int | None = None, new_ep_rank: int | None = None, + dp_size: int | None = None, + top_k: int | None = None, + max_num_batched_tokens: int | None = None, ) -> None: """ Update expert mappings for new EP configuration. 
@@ -374,6 +377,10 @@ def update( Args: new_ep_size: New EP world size (if changed) new_ep_rank: New EP rank (if changed) + dp_size: New DP size (if changed, for AITER buffer reinitialization) + top_k: New top_k (if changed, for AITER buffer reinitialization) + max_num_batched_tokens: New max batched tokens (if changed, for AITER + buffer reinitialization) """ if new_ep_size is not None: self.moe_parallel_config.ep_size = new_ep_size @@ -387,6 +394,16 @@ def update( self._calculate_expert_maps() self._maybe_init_routing_tables() + # Reinitialize AITER buffer if needed and parameters provided + if self.num_fused_shared_experts > 0 and all( + x is not None for x in [dp_size, top_k, max_num_batched_tokens] + ): + self._init_aiter_shared_experts_topK_buffer( + dp_size=dp_size, # type: ignore + top_k=top_k, # type: ignore + max_num_batched_tokens=max_num_batched_tokens, # type: ignore + ) + def get_compressed_map_string(self) -> str: """ Get compressed string representation of expert map for logging. 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7174cdd88f25..5fc7c6cc5467 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -3,13 +3,13 @@ from collections.abc import Callable, Iterable from enum import Enum -from typing import Literal, cast, get_args, overload +from typing import Literal, cast, overload import torch from torch.nn.parameter import UninitializedParameter from vllm._aiter_ops import rocm_aiter_ops -from vllm.config import VllmConfig, get_current_vllm_config +from vllm.config import get_current_vllm_config from vllm.config.parallel import ExpertPlacementStrategy from vllm.distributed import ( get_dp_group, @@ -26,15 +26,15 @@ FusedMoEQuantConfig, RoutingMethodType, ) +from vllm.model_executor.layers.fused_moe.expert_map_manager import ( + ExpertMapManager, +) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) -from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - init_aiter_topK_meta_data, -) from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) @@ -68,152 +68,6 @@ class FusedMoeWeightScaleSupported(Enum): BLOCK = "block" -def determine_expert_map( - ep_size: int, - ep_rank: int, - global_num_experts: int, - expert_placement_strategy: ExpertPlacementStrategy = "linear", - num_fused_shared_experts: int = 0, - return_expert_mask: bool = False, -) -> tuple[int, torch.Tensor | None, torch.Tensor | None]: - """ - Calculates how many experts should be assigned to each rank for EP and - creates a mapping from global to local expert index. Experts are - distributed evenly across ranks. Any remaining are assigned to the - last rank. 
- - Args: - ep_size: The size of the expert parallel group - ep_rank: The rank of the current process in the expert parallel - group - global_num_experts: The total number of experts in the model. - expert_placement_strategy: The expert placement strategy. - - Returns: - tuple[int, Optional[torch.Tensor]]: A tuple containing: - - local_num_experts (int): The number of experts assigned - to the current rank. - - expert_map (Optional[torch.Tensor]): A tensor of shape - (global_num_experts,) mapping from global to local index. - Contains -1 for experts not assigned to the current rank. - Returns None if ep_size is 1. - - expert_mask (Optional[torch.Tensor]): A tensor of shape - (global_num_experts + num_fused_shared_experts + 1,) - containing 1 for experts assigned to the current rank - and 0 for sentinel. - Returns None if ep_size is 1. - Used only when AITER MOE is enabled. - """ - assert ep_size > 0 - if ep_size == 1: - return (global_num_experts, None, None) - - # Distribute experts as evenly as possible to each rank. 
- base_experts = global_num_experts // ep_size - remainder = global_num_experts % ep_size - local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts - - # Create a tensor of size num_experts filled with -1 - expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32) - # Create an expert map for the local experts - if expert_placement_strategy == "linear": - start_idx = ep_rank * base_experts + min(ep_rank, remainder) - expert_map[start_idx : start_idx + local_num_experts] = torch.arange( - 0, local_num_experts, dtype=torch.int32 - ) - elif expert_placement_strategy == "round_robin": - local_log_experts = torch.arange( - ep_rank, global_num_experts, ep_size, dtype=torch.int32 - ) - - expert_map[local_log_experts] = torch.arange( - 0, local_num_experts, dtype=torch.int32 - ) - else: - raise ValueError( - "Unsupported expert placement strategy " - f"'{expert_placement_strategy}', expected one of " - f"{get_args(ExpertPlacementStrategy)}" - ) - - expert_mask = None - if return_expert_mask: - expert_mask = torch.ones( - (global_num_experts + num_fused_shared_experts + 1,), dtype=torch.int32 - ) - expert_mask[-1] = 0 - expert_mask[:global_num_experts] = expert_map > -1 - expert_map = torch.cat( - ( - expert_map, - torch.tensor( - [local_num_experts + i for i in range(num_fused_shared_experts)], - dtype=torch.int32, - ), - ), - dim=0, - ) - - return (local_num_experts, expert_map, expert_mask) - - -def determine_expert_placement_strategy( - expert_placement_strategy: ExpertPlacementStrategy, - moe_parallel_config: FusedMoEParallelConfig, - num_expert_group: int | None, - num_redundant_experts: int, - enable_eplb: bool, -) -> ExpertPlacementStrategy: - if expert_placement_strategy == "round_robin": - round_robin_supported = ( - (num_expert_group is not None and num_expert_group > 1) - and num_redundant_experts == 0 - and not enable_eplb - ) - - if not round_robin_supported: - logger.warning( - "Round-robin expert placement is only supported 
for " - "models with multiple expert groups and no redundant " - "experts. Falling back to linear expert placement." - ) - return "linear" - if ( - moe_parallel_config.use_all2all_kernels - and not moe_parallel_config.needs_round_robin_routing_tables - ): - logger.warning( - "Round-robin expert placement currently only supports " - "the DeepEP low-latency or NIXL EP backend, but '%s' was configured. " - "Falling back to linear expert placement.", - moe_parallel_config.all2all_backend, - ) - return "linear" - - return expert_placement_strategy - - -def get_compressed_expert_map(expert_map: torch.Tensor) -> str: - """ - Compresses the expert map by removing any -1 entries. - - Args: - expert_map (torch.Tensor): A tensor of shape (global_num_experts,) - mapping from global to local index. Contains -1 for experts not - assigned to the current rank. - - Returns: - str: A string mapping from local to global index. - Using str to support hashing for logging once only. - """ - global_indices = torch.where(expert_map != -1)[0] - local_indices = expert_map[global_indices] - return ", ".join( - f"{local_index.item()}->{global_index.item()}" - for local_index, global_index in zip(local_indices, global_indices) - ) - - # --8<-- [start:fused_moe] @PluggableLayer.register("fused_moe") class FusedMoE(PluggableLayer): @@ -384,54 +238,34 @@ def __init__( "Redundant experts are only supported with EPLB." 
) - self.expert_placement_strategy = determine_expert_placement_strategy( - expert_placement_strategy=self.expert_placement_strategy, - moe_parallel_config=self.moe_parallel_config, - num_expert_group=num_expert_group, - num_redundant_experts=num_redundant_experts, - enable_eplb=self.enable_eplb, - ) + # Create ExpertMapManager to handle expert mapping and placement + self.expert_map_manager = ExpertMapManager( + max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens, + top_k=top_k, + global_num_experts=self.global_num_experts, + logical_num_experts=self.logical_num_experts, + num_redundant_experts=num_redundant_experts, + num_expert_group=num_expert_group, + moe_parallel_config=self.moe_parallel_config, + placement_strategy=self.expert_placement_strategy, + enable_eplb=self.enable_eplb, + num_fused_shared_experts=self.num_fused_shared_experts, + rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, + device=None, + ) - self._expert_map: torch.Tensor | None - local_num_experts, expert_map, expert_mask = determine_expert_map( - ep_size=self.ep_size, - ep_rank=self.ep_rank, - global_num_experts=self.global_num_experts, - expert_placement_strategy=self.expert_placement_strategy, - num_fused_shared_experts=self.num_fused_shared_experts, - return_expert_mask=self.rocm_aiter_fmoe_enabled, - ) - self.local_num_experts = local_num_experts - self.register_buffer("_expert_map", expert_map) - self.register_buffer("expert_mask", expert_mask) - self._maybe_init_expert_routing_tables() - logger.info_once( - "[EP Rank %s/%s] Expert parallelism is enabled. Expert " - "placement strategy: %s. Local/global" - " number of experts: %s/%s. 
Experts local to global index map:" - " %s.", - self.ep_rank, - self.ep_size, - self.expert_placement_strategy, - self.local_num_experts, - self.global_num_experts, - get_compressed_expert_map(self._expert_map), - ) - else: - self.local_num_experts, self._expert_map, self.expert_mask = ( - self.global_num_experts, - None, - None, - ) + # Extract properties from ExpertMapManager + self.local_num_experts = self.expert_map_manager.local_num_experts + self.expert_placement_strategy = self.expert_map_manager.placement_strategy + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) self.top_k = top_k - self._init_aiter_shared_experts_topK_buffer( - vllm_config=vllm_config, dp_size=dp_size_ - ) + # AITER buffer initialization is handled by ExpertMapManager if self.use_ep and self.rocm_aiter_fmoe_enabled: assert self.expert_mask is None or torch.all( - (expert_mask == 0) | (expert_mask == 1) + (self.expert_mask == 0) | (self.expert_mask == 1) ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." 
assert intermediate_size % self.tp_size == 0 @@ -705,17 +539,10 @@ def _maybe_init_expert_routing_tables( ), ) - if self._expert_map is None: + routing_tables = self.expert_map_manager.routing_tables + if routing_tables is None: return None - routing_tables = self.ensure_round_robin_expert_routing_tables( - global_num_experts=self.global_num_experts, - ep_size=self.ep_size, - ep_rank=self.ep_rank, - local_num_experts=self.local_num_experts, - device=self._expert_map.device, - ) - global_to_physical, physical_to_global, local_global = routing_tables self.register_buffer("expert_global_to_physical", global_to_physical) self.register_buffer("expert_physical_to_global", physical_to_global) @@ -723,66 +550,28 @@ def _maybe_init_expert_routing_tables( return routing_tables - @staticmethod - def ensure_round_robin_expert_routing_tables( - global_num_experts: int, - ep_size: int, - ep_rank: int, - local_num_experts: int, - device: torch.device | None = None, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - device_kwargs = {"device": device} if device is not None else {} - global_indices = torch.arange( - global_num_experts, dtype=torch.long, **device_kwargs - ) - owner = torch.remainder(global_indices, ep_size) - local_index = torch.div(global_indices, ep_size, rounding_mode="floor") - base = global_num_experts // ep_size - remainder = global_num_experts % ep_size - physical_offset = owner * base - if remainder > 0: - remainder_tensor = torch.tensor( - remainder, dtype=torch.long, **device_kwargs - ) - physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) - - global_to_physical = physical_offset + local_index - physical_to_global = torch.empty_like(global_to_physical) - physical_to_global[global_to_physical] = global_indices - - local_global = torch.arange( - ep_rank, - global_num_experts, - ep_size, - dtype=torch.long, - **device_kwargs, - ) - if local_global.numel() != local_num_experts: - local_global = local_global[:local_num_experts] - - 
return (global_to_physical, physical_to_global, local_global) - def update_expert_map(self): # ep_size and ep_rank should already be updated - assert self._expert_map is not None - with self._expert_map.device: - local_num_experts, expert_map, expert_mask = determine_expert_map( - ep_size=self.ep_size, - ep_rank=self.ep_rank, - global_num_experts=self.global_num_experts, - expert_placement_strategy=self.expert_placement_strategy, - num_fused_shared_experts=self.num_fused_shared_experts, - return_expert_mask=self.rocm_aiter_fmoe_enabled, - ) - self.local_num_experts = local_num_experts - self.register_buffer("_expert_map", expert_map) - self.register_buffer("expert_mask", expert_mask) - self._maybe_init_expert_routing_tables() - if self.aiter_fmoe_shared_expert_enabled: - self._init_aiter_shared_experts_topK_buffer( - vllm_config=get_current_vllm_config(), - dp_size=get_dp_group().world_size, - ) + # Update ExpertMapManager with new EP configuration + vllm_config = get_current_vllm_config() + self.expert_map_manager.update( + new_ep_size=self.ep_size, + new_ep_rank=self.ep_rank, + dp_size=get_dp_group().world_size + if self.aiter_fmoe_shared_expert_enabled + else None, + top_k=self.top_k if self.aiter_fmoe_shared_expert_enabled else None, + max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens + if self.aiter_fmoe_shared_expert_enabled + else None, + ) + + # Update local attributes from ExpertMapManager + self.local_num_experts = self.expert_map_manager.local_num_experts + self.expert_placement_strategy = self.expert_map_manager.placement_strategy + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + self._maybe_init_expert_routing_tables() def _load_per_tensor_weight_scale( self, @@ -1050,26 +839,7 @@ def _load_g_idx( expert_data.copy_(loaded_weight) def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - if self._expert_map is 
None: - return expert_id - return self._expert_map[expert_id].item() - - def _init_aiter_shared_experts_topK_buffer( - self, vllm_config: VllmConfig, dp_size: int - ): - if self.num_fused_shared_experts > 0: - init_aiter_topK_meta_data( - n_routed_experts=self.global_num_experts, - n_shared_experts=self.num_fused_shared_experts, - top_k=self.top_k, - tp_rank=self.ep_rank if self.use_ep else self.tp_rank, - tp_size=self.ep_size if self.use_ep else self.tp_size, - shared_experts_score=1.0, - max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens - * dp_size, - is_EP=self.use_ep, - ) - self.local_num_experts += self.num_fused_shared_experts + return self.expert_map_manager.map_global_to_local(expert_id) @overload def weight_loader( From 64cf4acd00ef0e02bc68921dd4ab47ca7688765d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 20:01:02 +0000 Subject: [PATCH 116/191] update Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 17 ++++++++++--- vllm/model_executor/layers/fused_moe/layer.py | 24 +++++++++++++------ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index e587a854ea23..5f2cac803ee3 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -476,8 +476,13 @@ def _calculate_expert_maps(self) -> None: if self._expert_mask is not None: self._expert_mask = self._expert_mask.to(self.device) - def _maybe_init_routing_tables(self) -> None: - """Initialize routing tables if needed for round-robin.""" + def ensure_routing_tables_initialized(self) -> None: + """ + Ensure routing tables are initialized if needed for round-robin. + + This is a public method that can be called to explicitly initialize + routing tables. It's safe to call multiple times (idempotent). 
+ """ if self._placement_strategy != "round_robin": return @@ -490,7 +495,13 @@ def _maybe_init_routing_tables(self) -> None: if self._expert_map is None: return - self._routing_tables = self._ensure_round_robin_expert_routing_tables() + # Only initialize if not already initialized + if not hasattr(self, "_routing_tables"): + self._routing_tables = self._ensure_round_robin_expert_routing_tables() + + def _maybe_init_routing_tables(self): + """Initialize routing tables if needed for round-robin (internal).""" + self.ensure_routing_tables_initialized() def _ensure_round_robin_expert_routing_tables( self, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5fc7c6cc5467..5dda85d74c15 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -262,12 +262,6 @@ def __init__( self.top_k = top_k - # AITER buffer initialization is handled by ExpertMapManager - if self.use_ep and self.rocm_aiter_fmoe_enabled: - assert self.expert_mask is None or torch.all( - (self.expert_mask == 0) | (self.expert_mask == 1) - ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." 
- assert intermediate_size % self.tp_size == 0 intermediate_size_per_partition = intermediate_size // self.tp_size self.renormalize = renormalize @@ -539,10 +533,15 @@ def _maybe_init_expert_routing_tables( ), ) + # Explicitly ensure routing tables are initialized in ExpertMapManager + self.expert_map_manager._maybe_init_routing_tables() + + # Get routing tables from ExpertMapManager routing_tables = self.expert_map_manager.routing_tables if routing_tables is None: return None + # Register routing tables as buffers for this layer global_to_physical, physical_to_global, local_global = routing_tables self.register_buffer("expert_global_to_physical", global_to_physical) self.register_buffer("expert_physical_to_global", physical_to_global) @@ -553,6 +552,9 @@ def _maybe_init_expert_routing_tables( def update_expert_map(self): # ep_size and ep_rank should already be updated # Update ExpertMapManager with new EP configuration + # Note: ExpertMapManager.update() recalculates expert maps and + # reinitializes routing tables internally, so no need to call + # _maybe_init_expert_routing_tables() again vllm_config = get_current_vllm_config() self.expert_map_manager.update( new_ep_size=self.ep_size, @@ -571,7 +573,15 @@ def update_expert_map(self): self.expert_placement_strategy = self.expert_map_manager.placement_strategy self.register_buffer("_expert_map", self.expert_map_manager.expert_map) self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - self._maybe_init_expert_routing_tables() + + # Update routing table buffers if they exist + # Note: Routing tables are already initialized by ExpertMapManager.update() + routing_tables = self.expert_map_manager.routing_tables + if routing_tables is not None: + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + 
self.register_buffer("expert_local_to_global", local_global) def _load_per_tensor_weight_scale( self, From 42c7fc4b009c4bb82d9b36a7723116c2ad31b749 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 20:12:40 +0000 Subject: [PATCH 117/191] merge Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 6 ++---- vllm/model_executor/layers/fused_moe/layer.py | 12 ++++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 5f2cac803ee3..7438c8330c85 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -483,13 +483,11 @@ def ensure_routing_tables_initialized(self) -> None: This is a public method that can be called to explicitly initialize routing tables. It's safe to call multiple times (idempotent). """ + # Only needed for round-robin with DeepEP-ll or NIXL EP backends if self._placement_strategy != "round_robin": return - if ( - not self.moe_parallel_config.use_deepep_ll_kernels - and not self.moe_parallel_config.use_nixl_ep_kernels - ): + if not self.moe_parallel_config.needs_round_robin_routing_tables: return if self._expert_map is None: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5dda85d74c15..13610e1ef3bd 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -516,13 +516,7 @@ def is_internal_router(self) -> bool: def _maybe_init_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - # Currently routing_tables only needed for round-robin expert placement - # with DeepEP-ll or NIXL EP all2all backends. 
- if self.expert_placement_strategy != "round_robin" or ( - not self.moe_parallel_config.needs_round_robin_routing_tables - ): - return None - + # Return cached routing tables if already registered as buffers if hasattr(self, "expert_global_to_physical"): return cast( tuple[torch.Tensor, torch.Tensor, torch.Tensor], @@ -533,7 +527,9 @@ def _maybe_init_expert_routing_tables( ), ) - # Explicitly ensure routing tables are initialized in ExpertMapManager + # Delegate to ExpertMapManager to initialize routing tables if needed + # (ExpertMapManager determines if routing tables are needed based on + # placement strategy and backend configuration) self.expert_map_manager._maybe_init_routing_tables() # Get routing tables from ExpertMapManager From 9fe13927d6bdbff2ac3be1a6f41d23e49fac31a7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 20:54:37 +0000 Subject: [PATCH 118/191] eplb manager Signed-off-by: Bill Nell --- .../layers/fused_moe/eplb_manager.py | 246 ++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 184 +++++-------- 2 files changed, 308 insertions(+), 122 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/eplb_manager.py diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py new file mode 100644 index 000000000000..68137ccb01bc --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +EPLB (Expert Parallelism Load Balancing) Manager. + +This module provides the EplbManager class which encapsulates all EPLB-related +functionality for MoE layers, including state management, expert weight +collection, and expert parameter mapping. 
+""" + +from collections.abc import Iterable + +import torch + +from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState + + +class EplbManager: + """ + Manages Expert Parallelism Load Balancing (EPLB) state and operations + for a MoE layer. + + This class encapsulates all EPLB-related functionality including: + - Runtime state (expert load view, logical-to-physical mapping) + - Expert weight collection for load balancing + - Expert parameter mapping for weight loading with redundant experts + - Validation of EPLB configuration constraints + """ + + def __init__( + self, + ep_size: int, + global_num_experts: int, + logical_num_experts: int, + num_redundant_experts: int = 0, + ): + """ + Initialize EPLB manager. + + Args: + ep_size: Expert parallel world size + global_num_experts: Total number of experts (including redundant) + logical_num_experts: Number of logical (non-redundant) experts + num_redundant_experts: Number of redundant experts + """ + self.ep_size = ep_size + self.global_num_experts = global_num_experts + self.logical_num_experts = logical_num_experts + self.num_redundant_experts = num_redundant_experts + + # Runtime EPLB state. + self.state = EplbLayerState() + + # Validate EPLB configuration. + # EPLB currently only supports even distribution of experts across ranks + if self.global_num_experts % self.ep_size != 0: + raise ValueError( + f"EPLB currently only supports even distribution of " + f"experts across ranks. Got {self.global_num_experts} experts " + f"and {self.ep_size} EP ranks." + ) + + def set_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + """ + Register the EPLB state for this layer. + + This is used later in forward pass, where we get the expert mapping + and record the load metrics in `expert_load_view`. 
+ + Args: + moe_layer_idx: Index of this MoE layer + expert_load_view: View into global expert load tracking tensor + logical_to_physical_map: Mapping from logical to physical expert IDs + logical_replica_count: Number of replicas for each logical expert + """ + self.state.expert_load_view = expert_load_view[moe_layer_idx] + self.state.logical_to_physical_map = logical_to_physical_map[moe_layer_idx] + self.state.logical_replica_count = logical_replica_count[moe_layer_idx] + + @staticmethod + def get_expert_weights( + layer: torch.nn.Module, # FusedMoE + ) -> Iterable[torch.Tensor]: + """ + Collect expert weights from the MoE layer for EPLB. + + Returns weights reshaped as (local_num_experts, -1) for efficient + expert weight swapping during load balancing. + + Args: + layer: The FusedMoE layer to collect weights from + + Returns: + Iterable of expert weight tensors + """ + + def _maybe_make_contiguous( + name: str, p: torch.nn.Parameter + ) -> torch.nn.Parameter: + """ + In some cases, the last 2 dimensions (the non-expert dimensions) + of the weight scale tensor are transposed. This function + transforms the tensor (view update) so the tensor is contiguous(). + Example: A non-contiguous scale tensor, + `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to + `x_` of shape (E, 16, 32) and stride (512, 32, 1). + Note that we specifically use torch.transpose() so `x_` refers + to the same underlying memory. The tensors `x` and `x_`, pointing + to the same underlying memory make this transformation safe in the + context of EPLB. i.e. It is the same memory and just the view + is different. + Note: This function handles the "weight_scale" tensors specifically. + This could however be generalized to handle similar tensors. + """ + if p.ndim != 3: + return p + if p.is_contiguous(): + # Already contiguous. do nothing. + return p + # p is non-contiguous. We only handle the case where the last 2 + # dimensions of the scales tensor is transposed. 
We can handle + # other cases when they become relevant. + is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1 + if "weight_scale" not in name or not is_transposed_12: + # do nothing. + return p + + # Do not update the layer parameter as the layer's MoE operations would + # expect the parameter's tensor to the same shape / stride. Instead, + # make a new torch.nn.Parameter that is used just in the context of + # EPLB. + return torch.nn.Parameter( + torch.transpose(p.data, 1, 2), requires_grad=False + ) + + weights = list(layer.named_parameters()) + weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] + + # `w13_input_scale` and `w2_input_scale` are global per-tensor + # activation scales shared across all experts (e.g. NVFP4). + # They are broadcast views (stride 0) from .expand() and are + # not actual expert weights, so exclude them from EPLB. + NON_EXPERT_WEIGHTS = { + "e_score_correction_bias", + "w13_input_scale", + "w2_input_scale", + } + + assert all( + weight.is_contiguous() + for name, weight in weights + if not ( + name.startswith("_shared_experts.") + or name.startswith("_gate.") + or name.startswith("_routed_input_transform.") + or name.startswith("_routed_output_transform.") + ) + and name not in NON_EXPERT_WEIGHTS + ) + + return [ + weight.view(layer.local_num_experts, -1) + for name, weight in weights + if name not in NON_EXPERT_WEIGHTS + and weight.shape != torch.Size([]) + and not name.startswith("_shared_experts.") + # exclude parameters from non-expert submodules, + # e.g. gate/shared/transforms. 
+ and not name.startswith("_gate.") + and not name.startswith("_routed_input_transform.") + and not name.startswith("_routed_output_transform.") + ] + + @staticmethod + def make_expert_params_mapping( + model: torch.nn.Module, + ckpt_gate_proj_name: str, + ckpt_down_proj_name: str, + ckpt_up_proj_name: str, + num_experts: int, + num_redundant_experts: int = 0, + ) -> list[tuple[str, str, int, str]]: + """ + Create expert parameter mapping for weight loading with redundant experts. + + This mapping handles the physical-to-logical expert ID conversion needed + when loading weights with EPLB redundant experts. + + Args: + model: The model containing the MoE layer + ckpt_gate_proj_name: Name of gate projection in checkpoint + ckpt_down_proj_name: Name of down projection in checkpoint + ckpt_up_proj_name: Name of up projection in checkpoint + num_experts: Number of logical (non-redundant) experts + num_redundant_experts: Number of redundant experts + + Returns: + List of tuples (param_name, weight_name, expert_id, shard_id) + where: + - param_name: Parameter name in the layer + - weight_name: Weight name in checkpoint + - expert_id: Physical expert ID + - shard_id: Shard identifier (w1, w2, w3) + """ + num_physical_experts = num_experts + num_redundant_experts + + # In the returned mapping: + # - `expert_id` is the physical expert id + # - `weight_name` contains the weight name of the logical expert + # So that we should map the expert id to logical in `weight_name` + physical_to_logical_map = ( + EplbState.build_initial_global_physical_to_logical_map( + num_experts, num_redundant_experts + ) + ) + + base_layer = ( + "base_layer." + if any(".base_layer." 
in name for name, _ in model.named_parameters()) + else "" + ) + + return [ + # (param_name, weight_name, expert_id, shard_id) + ( + f"experts.{base_layer}w13_" + if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] + else f"experts.{base_layer}w2_", + f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", + expert_id, + shard_id, + ) + for expert_id in range(num_physical_experts) + for shard_id, weight_name in [ + ("w1", ckpt_gate_proj_name), + ("w2", ckpt_down_proj_name), + ("w3", ckpt_up_proj_name), + ] + ] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7174cdd88f25..668a848f3589 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -16,7 +16,6 @@ get_pcp_group, get_tensor_model_parallel_world_size, ) -from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState from vllm.logger import init_logger from vllm.model_executor.custom_op import PluggableLayer from vllm.model_executor.layers.fused_moe.activation import MoEActivation @@ -26,6 +25,7 @@ FusedMoEQuantConfig, RoutingMethodType, ) +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) @@ -342,12 +342,25 @@ def __init__( self.layer_name = prefix self.enable_eplb = enable_eplb - # TODO(bnell): should this be owned by router? 
- self.eplb_state = EplbLayerState() self.expert_placement_strategy: ExpertPlacementStrategy = ( vllm_config.parallel_config.expert_placement_strategy ) + # Create EPLB manager (always constructed for consistent API) + self.eplb_manager: EplbManager | None = None + if enable_eplb: + self.eplb_manager = EplbManager( + ep_size=self.moe_parallel_config.ep_size, + global_num_experts=self.global_num_experts, + logical_num_experts=self.logical_num_experts, + num_redundant_experts=num_redundant_experts, + ) + else: + # EPLB validation is handled by EplbManager.__init__ + assert not self.use_ep or num_redundant_experts == 0, ( + "Redundant experts are only supported with EPLB." + ) + # ROCm aiter shared experts fusion # AITER only supports gated activations (silu/gelu), so disable it # for non-gated MoE (is_act_and_mul=False) @@ -374,16 +387,6 @@ def __init__( # Determine expert maps if self.use_ep: - if self.enable_eplb: - assert self.global_num_experts % self.ep_size == 0, ( - "EPLB currently only supports even distribution of " - "experts across ranks." - ) - else: - assert num_redundant_experts == 0, ( - "Redundant experts are only supported with EPLB." - ) - self.expert_placement_strategy = determine_expert_placement_strategy( expert_placement_strategy=self.expert_placement_strategy, moe_parallel_config=self.moe_parallel_config, @@ -1435,82 +1438,18 @@ def load_weights( yield param_name def get_expert_weights(self) -> Iterable[torch.Tensor]: - def _maybe_make_contiguous( - name: str, p: torch.nn.Parameter - ) -> torch.nn.Parameter: - """ - In some cases, the last 2 dimensions (the non-expert dimensions) - of the weight scale tensor are transposed. This function - transforms the tensor (view update) so the tensor is contiguous(). - Example: A non-contiguous scale tensor, - `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to - `x_` of shape (E, 16, 32) and stride (512, 32, 1). 
- Note that we specifically use torch.transpose() so `x_` refers - to the same underlying memory. The tensors `x` and `x_`, pointing - to the same underlying memory make this transformation safe in the - context of EPLB. i.e. It is the same memory and just the view - is different. - Note: This function handles the "weight_scale" tensors specifically. - This could however be generalized to handle similar tensors. - """ - if p.ndim != 3: - return p - if p.is_contiguous(): - # Already contiguous. do nothing. - return p - # p is non-contiguous. We only handle the case where the last 2 - # dimensions of the scales tensor is transposed. We can handle - # other cases when they become relevant. - is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1 - if "weight_scale" not in name or not is_transposed_12: - # do nothing. - return p - - # Do not update the layer parameter as the layer's MoE operations would - # expect the parameter's tensor to the same shape / stride. Instead, - # make a new torch.nn.Parameter that is used just in the context of - # EPLB. - return torch.nn.Parameter( - torch.transpose(p.data, 1, 2), requires_grad=False - ) + """ + Collect expert weights for EPLB load balancing. - weights = list(self.named_parameters()) - weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] - - # `w13_input_scale` and `w2_input_scale` are global per-tensor - # activation scales shared across all experts (e.g. NVFP4). - # They are broadcast views (stride 0) from .expand() and are - # not actual expert weights, so exclude them from EPLB. - NON_EXPERT_WEIGHTS = { - "e_score_correction_bias", - "w13_input_scale", - "w2_input_scale", - } + Returns weights reshaped as (local_num_experts, -1) for efficient + expert weight swapping during load balancing. 
- assert all( - weight.is_contiguous() - for name, weight in weights - if not ( - name.startswith("_shared_experts.") - or name.startswith("_gate.") - or name.startswith("_routed_input_transform.") - or name.startswith("_routed_output_transform.") - ) - and name not in NON_EXPERT_WEIGHTS - ) + Delegates to EplbManager. - return [ - weight.view(self.local_num_experts, -1) - for name, weight in weights - if name not in NON_EXPERT_WEIGHTS - and weight.shape != torch.Size([]) - and not name.startswith("_shared_experts.") - # exclude parameters from non-expert submodules, - # e.g. gate/shared/transforms. - and not name.startswith("_gate.") - and not name.startswith("_routed_input_transform.") - and not name.startswith("_routed_output_transform.") - ] + Returns: + Iterable of expert weight tensors + """ + return EplbManager.get_expert_weights(self) def set_eplb_state( self, @@ -1524,10 +1463,22 @@ def set_eplb_state( This is used later in forward pass, where we get the expert mapping and record the load metrics in `expert_load_view`. + + Delegates to EplbManager for state management. 
+ + Args: + moe_layer_idx: Index of this MoE layer + expert_load_view: View into global expert load tracking tensor + logical_to_physical_map: Mapping from logical to physical expert IDs + logical_replica_count: Number of replicas for each logical expert """ - self.eplb_state.expert_load_view = expert_load_view[moe_layer_idx] - self.eplb_state.logical_to_physical_map = logical_to_physical_map[moe_layer_idx] - self.eplb_state.logical_replica_count = logical_replica_count[moe_layer_idx] + if self.eplb_manager is not None: + self.eplb_manager.set_state( + moe_layer_idx, + expert_load_view, + logical_to_physical_map, + logical_replica_count, + ) def ensure_moe_quant_config_init(self): if self.quant_method.moe_quant_config is None: @@ -1570,41 +1521,30 @@ def make_expert_params_mapping( num_experts: int, num_redundant_experts: int = 0, ) -> list[tuple[str, str, int, str]]: - num_physical_experts = num_experts + num_redundant_experts - - # In the returned mapping: - # - `expert_id` is the physical expert id - # - `weight_name` contains the weight name of the logical expert - # So that we should map the expert id to logical in `weight_name` - physical_to_logical_map = ( - EplbState.build_initial_global_physical_to_logical_map( - num_experts, num_redundant_experts - ) - ) + """ + Create expert parameter mapping for weight loading. - base_layer = ( - "base_layer." - if any(".base_layer." in name for name, _ in model.named_parameters()) - else "" - ) + Delegates to EplbManager for proper handling of redundant experts. 
- return [ - # (param_name, weight_name, expert_id, shard_id) - ( - f"experts.{base_layer}w13_" - if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] - else f"experts.{base_layer}w2_", - f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", - expert_id, - shard_id, - ) - for expert_id in range(num_physical_experts) - for shard_id, weight_name in [ - ("w1", ckpt_gate_proj_name), - ("w2", ckpt_down_proj_name), - ("w3", ckpt_up_proj_name), - ] - ] + Args: + model: The model containing the MoE layer + ckpt_gate_proj_name: Name of gate projection in checkpoint + ckpt_down_proj_name: Name of down projection in checkpoint + ckpt_up_proj_name: Name of up projection in checkpoint + num_experts: Number of logical (non-redundant) experts + num_redundant_experts: Number of redundant experts + + Returns: + List of tuples (param_name, weight_name, expert_id, shard_id) + """ + return EplbManager.make_expert_params_mapping( + model, + ckpt_gate_proj_name, + ckpt_down_proj_name, + ckpt_up_proj_name, + num_experts, + num_redundant_experts, + ) @property def hidden_size(self) -> int: From 819a2ddb94ae995c9c79ecadfbf52ec3f39bbd58 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 21:35:17 +0000 Subject: [PATCH 119/191] fixes Signed-off-by: Bill Nell --- .../compressed_tensors/compressed_tensors.py | 7 ------- .../schemes/compressed_tensors_wNa16.py | 2 -- .../layers/quantization/experts_int8.py | 2 +- vllm/model_executor/layers/quantization/fp8.py | 13 ++++++------- 4 files changed, 7 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 8d6f7441e635..3308dcd44286 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -159,12 +159,9 @@ def get_quant_method( 
layer: torch.nn.Module, prefix: str, ) -> "QuantizeMethodBase | None": - print(f"GOT HERE {layer.__class__, isinstance(layer, LinearBase)}") - if isinstance(layer, LinearBase): # collect schemes quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) - print(f"GOT HERE QS {quant_scheme}") input_tfms, output_tfms = get_linear_transform_schemes( layer, prefix, self.transform_config, self.packed_modules_mapping ) @@ -187,9 +184,7 @@ def get_quant_method( if isinstance(layer, ParallelLMHead): try: quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) - print(f"FOUND QS {quant_scheme}") except ValueError: - print(f"FAILED FOUND QS {quant_scheme}") quant_scheme = None if quant_scheme is not None: layer.scheme = quant_scheme @@ -201,7 +196,6 @@ def get_quant_method( return CompressedTensorsMoEMethod.get_moe_method( self, layer, layer_name=prefix ) - print("NEVER!!!!!!!!!!!!!!!!!!!!!") return None def _add_fused_moe_to_target_scheme_map(self): @@ -934,7 +928,6 @@ def create_weights( details """ weight_loader = extra_weight_attrs.get("weight_loader") - print(f"SCHEME = {layer.__class__, layer.scheme, layer.prefix}") layer.scheme.create_weights( layer=layer, input_size=input_size, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 08eafdc48215..1883d4ae322c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -106,8 +106,6 @@ def create_weights( has_g_idx=self.has_g_idx, ) - print(f"LLC {mp_linear_kernel_config}") - kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) if kernel_type.__name__ not in self._kernel_backends_being_used: diff --git a/vllm/model_executor/layers/quantization/experts_int8.py 
b/vllm/model_executor/layers/quantization/experts_int8.py index 8c8ec8d1dd59..3db2916055ca 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -56,5 +56,5 @@ def get_quant_method( if isinstance(layer, LinearBase): return UnquantizedLinearMethod() elif isinstance(layer, RoutedExperts): - return Int8OnlineMoEMethod(layer=layer.moe_config) + return Int8OnlineMoEMethod(layer=layer) return None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 8d10c7c5d6b3..51912f3915f8 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -18,7 +18,6 @@ from vllm.model_executor.kernels.linear.scaled_mm import MarlinFP8ScaledMMLinearKernel from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import ( - FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, RoutedExperts, @@ -196,9 +195,9 @@ def get_quant_method( ): return UnquantizedFusedMoEMethod(layer.moe_config) if self.is_checkpoint_fp8_serialized: - moe_quant_method = Fp8MoEMethod(self, layer.moe_config) + moe_quant_method = Fp8MoEMethod(self, layer) else: - moe_quant_method = Fp8OnlineMoEMethod(self, layer.moe_config) + moe_quant_method = Fp8OnlineMoEMethod(self, layer) return moe_quant_method elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) @@ -569,8 +568,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): quant_config: The quantization config. 
""" - def __init__(self, quant_config: Fp8Config, moe_config: FusedMoEConfig): - super().__init__(moe_config) + def __init__(self, quant_config: Fp8Config, layer: RoutedExperts): + super().__init__(layer.moe_config) self.quant_config = quant_config self.weight_block_size = self.quant_config.weight_block_size self.block_quant: bool = self.weight_block_size is not None @@ -927,8 +926,8 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): uses_meta_device: bool = True - def __init__(self, quant_config: Fp8Config, moe_config: FusedMoEConfig): - super().__init__(quant_config, moe_config) + def __init__(self, quant_config: Fp8Config, layer: RoutedExperts): + super().__init__(quant_config, layer) assert not quant_config.is_checkpoint_fp8_serialized assert quant_config.activation_scheme == "dynamic" assert quant_config.weight_block_size is None From 933147767d3012a623f464b72597c6edc10649d4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 21:51:18 +0000 Subject: [PATCH 120/191] eplb manager Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 8 ++-- .../layers/fused_moe/router/base_router.py | 47 +++++++++++-------- .../fused_moe/router/custom_routing_router.py | 11 +++-- .../router/fused_topk_bias_router.py | 11 +++-- .../fused_moe/router/fused_topk_router.py | 11 +++-- .../fused_moe/router/grouped_topk_router.py | 11 +++-- .../layers/fused_moe/router/router_factory.py | 31 +++++------- .../router/routing_simulator_router.py | 13 ++--- .../fused_moe/router/zero_expert_router.py | 11 +++-- .../compressed_tensors_moe_w4a8_fp8.py | 4 -- .../compressed_tensors_moe_w4a8_int8.py | 1 - 11 files changed, 80 insertions(+), 79 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 668a848f3589..069215bd4cc3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -341,7 +341,6 @@ def __init__( 
compilation_config.static_all_moe_layers.append(prefix) self.layer_name = prefix - self.enable_eplb = enable_eplb self.expert_placement_strategy: ExpertPlacementStrategy = ( vllm_config.parallel_config.expert_placement_strategy ) @@ -392,7 +391,7 @@ def __init__( moe_parallel_config=self.moe_parallel_config, num_expert_group=num_expert_group, num_redundant_experts=num_redundant_experts, - enable_eplb=self.enable_eplb, + enable_eplb=enable_eplb, ) self._expert_map: torch.Tensor | None @@ -470,7 +469,7 @@ def __init__( self.router = create_fused_moe_router( top_k=top_k, global_num_experts=self.global_num_experts, - eplb_state=self.eplb_state, + eplb_manager=self.eplb_manager, renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -480,7 +479,6 @@ def __init__( routed_scaling_factor=self.routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=self.num_fused_shared_experts, - enable_eplb=enable_eplb, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. indices_type_getter=lambda: self.quant_method.topk_indices_dtype, @@ -546,7 +544,7 @@ def _get_quant_method() -> FusedMoEMethodBase: "is_act_and_mul=False is supported only for CUDA and ROCm for now" ) - if self.enable_eplb and not self.quant_method.supports_eplb: + if enable_eplb and not self.quant_method.supports_eplb: # TODO: Add support for additional quantization methods. 
# The implementation for other quantization methods does not # contain essential differences, but the current quant API diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index 0138eb59c91c..18f19a6eacf1 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -2,16 +2,19 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Callable +from typing import TYPE_CHECKING import torch -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, ) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager + if current_platform.is_cuda_alike(): @triton.jit @@ -148,8 +151,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, # TODO(bnell): Once the MK is constructed at layer init time, we # can make this a plain value instead of a callback. indices_type_getter: Callable[[], torch.dtype | None] | None = None, @@ -159,12 +161,17 @@ def __init__( time, so we need to supply a callback to get it at runtime. This is because the indices type is supplied by modular kernels which are created after MoE layer/router construction. 
+ + Args: + top_k: Number of experts to select per token + global_num_experts: Total number of experts + eplb_manager: Optional EPLB manager for load balancing + indices_type_getter: Optional callback to get indices dtype """ super().__init__() self.top_k = top_k self.global_num_experts = global_num_experts - self.eplb_state = eplb_state - self.enable_eplb = enable_eplb + self.eplb_manager = eplb_manager self.indices_type_getter = indices_type_getter self.capture_fn: Callable[[torch.Tensor], None] | None = None @@ -174,18 +181,19 @@ def set_capture_fn(self, capture_fn: Callable[[torch.Tensor], None] | None) -> N def _validate_eplb_state(self) -> None: """Validate that EPLB state is properly initialized if EPLB is enabled.""" - if self.enable_eplb: - if self.eplb_state.expert_load_view is None: + if self.eplb_manager is not None: + eplb_state = self.eplb_manager.state + if eplb_state.expert_load_view is None: raise ValueError("enable_eplb=True requires expert_load_view != None") - if self.eplb_state.logical_to_physical_map is None: + if eplb_state.logical_to_physical_map is None: raise ValueError( "enable_eplb=True requires logical_to_physical_map != None" ) - if self.eplb_state.logical_replica_count is None: + if eplb_state.logical_replica_count is None: raise ValueError( "enable_eplb=True requires logical_replica_count != None" ) - if self.eplb_state.should_record_tensor is None: + if eplb_state.should_record_tensor is None: raise ValueError( "enable_eplb=True requires should_record_tensor != None" ) @@ -198,17 +206,18 @@ def _get_indices_type(self) -> torch.dtype | None: def _apply_eplb_mapping(self, topk_ids: torch.Tensor) -> torch.Tensor: """Apply EPLB mapping to convert logical expert IDs to physical expert IDs.""" - if self.enable_eplb: - assert self.eplb_state.expert_load_view is not None - assert self.eplb_state.logical_to_physical_map is not None - assert self.eplb_state.logical_replica_count is not None - assert self.eplb_state.should_record_tensor is not 
None + if self.eplb_manager is not None: + eplb_state = self.eplb_manager.state + assert eplb_state.expert_load_view is not None + assert eplb_state.logical_to_physical_map is not None + assert eplb_state.logical_replica_count is not None + assert eplb_state.should_record_tensor is not None return eplb_map_to_physical_and_record( topk_ids=topk_ids, - logical_to_physical_map=self.eplb_state.logical_to_physical_map, - logical_replica_count=self.eplb_state.logical_replica_count, - expert_load_view=self.eplb_state.expert_load_view, - record_enabled=self.eplb_state.should_record_tensor, + logical_to_physical_map=eplb_state.logical_to_physical_map, + logical_replica_count=eplb_state.logical_replica_count, + expert_load_view=eplb_state.expert_load_view, + record_enabled=eplb_state.should_record_tensor, ) return topk_ids diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py index c1bd7a6993ab..5be080242d10 100644 --- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py +++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py @@ -1,13 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from typing import TYPE_CHECKING import torch -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager + class CustomRoutingRouter(BaseRouter): """Router using a custom user-provided routing function.""" @@ -16,17 +19,15 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, custom_routing_function: Callable, + eplb_manager: EplbManager | None = None, renormalize: bool = True, - 
enable_eplb: bool = False, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) self.custom_routing_function = custom_routing_function diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 84eaad7f65e6..35ef95a85199 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools from collections.abc import Callable +from typing import TYPE_CHECKING import torch import torch.nn.functional as F @@ -9,13 +10,15 @@ import vllm._custom_ops as ops import vllm.envs as envs from vllm._aiter_ops import rocm_aiter_ops -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, get_routing_method_type, ) from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager + def vllm_topk_softmax( topk_weights: torch.Tensor, @@ -235,11 +238,10 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, e_score_correction_bias: torch.Tensor | None = None, renormalize: bool = True, routed_scaling_factor: float = 1.0, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, *, scoring_func: str = "sigmoid", @@ -248,8 +250,7 @@ def __init__( super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, - enable_eplb=enable_eplb, + 
eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) self.e_score_correction_bias = e_score_correction_bias diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index 45311dba08e3..7c540bdd46ad 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -1,18 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from typing import TYPE_CHECKING import torch import vllm._custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, get_routing_method_type, ) from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager + def vllm_topk_softmax( topk_weights: torch.Tensor, @@ -120,17 +123,15 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, scoring_func: str = "softmax", renormalize: bool = True, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) self.renormalize = renormalize diff --git a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py index 74c3a62a1f11..b622f3bc7f57 100644 --- a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py @@ 
-2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable from functools import partial +from typing import TYPE_CHECKING import torch from vllm import _custom_ops as ops from vllm import envs as envs from vllm._aiter_ops import rocm_aiter_ops -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, @@ -25,6 +25,9 @@ from vllm.model_executor.utils import maybe_disable_graph_partition from vllm.platforms import current_platform +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager + def fused_grouped_topk( hidden_states: torch.Tensor, @@ -251,7 +254,6 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, num_expert_group: int, topk_group: int, renormalize: bool = True, @@ -259,14 +261,13 @@ def __init__( routed_scaling_factor: float = 1.0, e_score_correction_bias: torch.Tensor | None = None, num_fused_shared_experts: int = 0, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) self.num_expert_group = num_expert_group diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py index da7896de6159..718f734ac43a 100644 --- a/vllm/model_executor/layers/fused_moe/router/router_factory.py +++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from typing import TYPE_CHECKING 
import torch import vllm.envs as envs -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import RoutingMethodType + +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.custom_routing_router import ( CustomRoutingRouter, ) @@ -29,8 +32,6 @@ ZeroExpertRouter, ) -EMPTY_EPLB_STATE: EplbLayerState = EplbLayerState() - def create_fused_moe_router( # common parameters @@ -50,8 +51,7 @@ def create_fused_moe_router( # custom routing parameters custom_routing_function: Callable | None = None, # eplb parameters - enable_eplb: bool = False, - eplb_state: EplbLayerState = EMPTY_EPLB_STATE, + eplb_manager: EplbManager | None = None, # zero expert parameters zero_expert_type: str | None = None, num_logical_experts: int | None = None, @@ -91,8 +91,7 @@ def create_fused_moe_router( custom_routing_function: Optional custom routing function EPLB arguments: - enable_eplb: Whether EPLB is enabled - eplb_state: EPLB (Expert Parallelism Load Balancing) state + eplb_manager: Optional EPLB (Expert Parallelism Load Balancing) manager Zero expert arguments: zero_expert_type: Type of zero expert (e.g. identity). 
If not None, @@ -112,8 +111,7 @@ def create_fused_moe_router( return RoutingSimulatorRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) @@ -127,14 +125,13 @@ def create_fused_moe_router( return ZeroExpertRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, e_score_correction_bias=e_score_correction_bias, num_logical_experts=num_logical_experts, zero_expert_type=zero_expert_type, scoring_func=scoring_func, renormalize=renormalize, routed_scaling_factor=routed_scaling_factor, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) @@ -148,7 +145,7 @@ def create_fused_moe_router( grouped_topk_router = GroupedTopKRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, num_expert_group=num_expert_group, topk_group=topk_group, renormalize=renormalize, @@ -156,7 +153,6 @@ def create_fused_moe_router( routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=num_fused_shared_experts, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) if ( @@ -176,10 +172,9 @@ def create_fused_moe_router( return CustomRoutingRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, custom_routing_function=custom_routing_function, renormalize=renormalize, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) @@ -189,11 +184,10 @@ def create_fused_moe_router( return FusedTopKBiasRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, e_score_correction_bias=e_score_correction_bias, renormalize=renormalize, routed_scaling_factor=routed_scaling_factor, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, 
scoring_func=scoring_func, hash_indices_table=hash_indices_table, @@ -202,9 +196,8 @@ def create_fused_moe_router( return FusedTopKRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, + eplb_manager=eplb_manager, renormalize=renormalize, scoring_func=scoring_func, - enable_eplb=enable_eplb, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py index 8fb36b72cb70..2db45f581634 100644 --- a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py +++ b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py @@ -2,16 +2,19 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Callable -from typing import Any +from typing import TYPE_CHECKING, Any import torch import vllm.envs as envs -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager + + logger = init_logger(__name__) @@ -313,15 +316,13 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py index 65760727770a..d8057e4300c3 100644 
--- a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py +++ b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py @@ -2,10 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from typing import TYPE_CHECKING import torch -from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, get_routing_method_type, @@ -18,6 +18,9 @@ fused_topk_bias, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager + class ZeroExpertRouter(BaseRouter): """Router that handles zero expert computation as part of routing. @@ -32,21 +35,19 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_state: EplbLayerState, e_score_correction_bias: torch.Tensor, num_logical_experts: int, zero_expert_type: str, scoring_func: str = "softmax", renormalize: bool = False, routed_scaling_factor: float = 1.0, - enable_eplb: bool = False, + eplb_manager: EplbManager | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_state=eplb_state, - enable_eplb=enable_eplb, + eplb_manager=eplb_manager, indices_type_getter=indices_type_getter, ) self.e_score_correction_bias = e_score_correction_bias diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py index b14571fe5013..efa28ac3b6ae 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py @@ -309,10 +309,6 @@ def apply( topk_ids: torch.Tensor, 
shared_experts_input: torch.Tensor | None, ) -> torch.Tensor: - if layer.enable_eplb: - raise NotImplementedError( - "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet." - ) assert self.moe_quant_config is not None from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_int8.py index 88cdbadd3f83..c697b137420b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_int8.py @@ -307,7 +307,6 @@ def apply_monolithic( router_logits: torch.Tensor, input_ids: torch.Tensor | None = None, ) -> torch.Tensor: - assert not layer.enable_eplb, "EPLB not supported for W4A8-int MoE yet." 
assert layer.activation in ( MoEActivation.SILU, MoEActivation.SWIGLUOAI, From 2332fd70bf3292798a47477ee9e929602da3caa4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 22:18:07 +0000 Subject: [PATCH 121/191] fix num_local_expert update Signed-off-by: Bill Nell --- .../model_executor/layers/fused_moe/expert_map_manager.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 7438c8330c85..ae788d81c77e 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -262,7 +262,6 @@ def _init_aiter_shared_experts_topK_buffer( max_num_tokens=max_num_batched_tokens * dp_size, is_EP=self.use_ep, ) - self._local_num_experts += self.num_fused_shared_experts @property def use_ep(self) -> int: @@ -391,6 +390,7 @@ def update( self._placement_strategy = self._determine_placement_strategy( self._placement_strategy ) + self._calculate_expert_maps() self._maybe_init_routing_tables() @@ -469,6 +469,8 @@ def _calculate_expert_maps(self) -> None: return_expert_mask=self.rocm_aiter_enabled, ) + self._local_num_experts += self.num_fused_shared_experts + # Move to device if specified if self.device is not None: if self._expert_map is not None: @@ -505,6 +507,10 @@ def _ensure_round_robin_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Build routing tables for round-robin placement.""" + assert self.num_fused_shared_experts == 0, ( + "Round robin not supported for AITER." 
+ ) + device_kwargs = {"device": self.device} if self.device is not None else {} global_indices = torch.arange( From 34988207a9faab704fd0ffa9316b8dcd6f8ec982 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 28 Apr 2026 19:52:00 +0000 Subject: [PATCH 122/191] fix Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 2 -- vllm/model_executor/layers/fused_moe/router/base_router.py | 5 +---- .../layers/fused_moe/router/custom_routing_router.py | 2 +- .../layers/fused_moe/router/fused_topk_bias_router.py | 2 +- .../layers/fused_moe/router/fused_topk_router.py | 2 +- .../layers/fused_moe/router/grouped_topk_router.py | 2 +- .../model_executor/layers/fused_moe/router/router_factory.py | 5 +---- .../layers/fused_moe/router/routing_simulator_router.py | 2 +- .../layers/fused_moe/router/zero_expert_router.py | 2 +- 9 files changed, 8 insertions(+), 16 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index 89e28d950f9d..243ace519fb8 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1004,7 +1004,6 @@ def make_fake_moe_layer( activation: str = "silu", indices_type: torch.dtype | None = None, expert_map: torch.Tensor | None = None, - enable_eplb: bool = False, expert_load_view: torch.Tensor | None = None, logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, @@ -1032,7 +1031,6 @@ def make_fake_moe_layer( routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=0, # TODO - enable_eplb=enable_eplb, # TODO(bnell): once we can construct the MK at init time, we # can make this a value. 
indices_type_getter=lambda: indices_type, diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index 18f19a6eacf1..e32816b395b9 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -2,19 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Callable -from typing import TYPE_CHECKING import torch +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, ) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager - if current_platform.is_cuda_alike(): @triton.jit diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py index 5be080242d10..41385c940040 100644 --- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py +++ b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py @@ -20,7 +20,7 @@ def __init__( top_k: int, global_num_experts: int, custom_routing_function: Callable, - eplb_manager: EplbManager | None = None, + eplb_manager: "EplbManager | None" = None, renormalize: bool = True, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 35ef95a85199..6d3bd6ac5529 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -241,7 +241,7 @@ def __init__( e_score_correction_bias: torch.Tensor | 
None = None, renormalize: bool = True, routed_scaling_factor: float = 1.0, - eplb_manager: EplbManager | None = None, + eplb_manager: "EplbManager | None" = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, *, scoring_func: str = "sigmoid", diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index 7c540bdd46ad..d88786491d7b 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -125,7 +125,7 @@ def __init__( global_num_experts: int, scoring_func: str = "softmax", renormalize: bool = True, - eplb_manager: EplbManager | None = None, + eplb_manager: "EplbManager | None" = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( diff --git a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py index b622f3bc7f57..461c5c351f05 100644 --- a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py @@ -261,7 +261,7 @@ def __init__( routed_scaling_factor: float = 1.0, e_score_correction_bias: torch.Tensor | None = None, num_fused_shared_experts: int = 0, - eplb_manager: EplbManager | None = None, + eplb_manager: "EplbManager | None" = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py index 718f734ac43a..89592830b23b 100644 --- a/vllm/model_executor/layers/fused_moe/router/router_factory.py +++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py @@ -1,15 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
from collections.abc import Callable -from typing import TYPE_CHECKING import torch import vllm.envs as envs from vllm.model_executor.layers.fused_moe.config import RoutingMethodType - -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager +from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.custom_routing_router import ( CustomRoutingRouter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py index 2db45f581634..7d0b8ba8b61a 100644 --- a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py +++ b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py @@ -316,7 +316,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager | None = None, + eplb_manager: "EplbManager | None" = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( diff --git a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py index d8057e4300c3..d61056026c01 100644 --- a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py +++ b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py @@ -41,7 +41,7 @@ def __init__( scoring_func: str = "softmax", renormalize: bool = False, routed_scaling_factor: float = 1.0, - eplb_manager: EplbManager | None = None, + eplb_manager: "EplbManager | None" = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( From 692912ed475ce91a3f080419d7662418ec5def2f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 28 Apr 2026 20:04:53 +0000 Subject: [PATCH 123/191] fix Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index 243ace519fb8..b79fa925c2cb 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1254,7 +1254,7 @@ def _test_body_eplb( ), ) - eplb_moe_layer.eplb_state.should_record_tensor = torch.ones( + eplb_moe_layer.eplb_manager.state.should_record_tensor = torch.ones( (), dtype=torch.bool, device=device ) From 7dba7b92fa0a9e4da25f77d53c84cde98ceff36b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 28 Apr 2026 20:18:39 +0000 Subject: [PATCH 124/191] fix merge Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 528d42a77a0f..d4b3fe803d12 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -248,7 +248,7 @@ def __init__( num_expert_group=num_expert_group, moe_parallel_config=self.moe_parallel_config, placement_strategy=self.expert_placement_strategy, - enable_eplb=self.enable_eplb, + enable_eplb=enable_eplb, num_fused_shared_experts=self.num_fused_shared_experts, rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, device=None, From aa210d3f3d8625d5e7a5ba0a5c9ceba9807ccaa3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 28 Apr 2026 22:33:56 +0000 Subject: [PATCH 125/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index ae788d81c77e..ba964e1ab7b7 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -135,8 +135,7 @@ def determine_expert_placement_strategy( return "linear" if ( 
moe_parallel_config.use_all2all_kernels - and not moe_parallel_config.use_deepep_ll_kernels - and not moe_parallel_config.use_nixl_ep_kernels + and not moe_parallel_config.needs_round_robin_routing_tables ): logger.warning( "Round-robin expert placement currently only supports " @@ -435,8 +434,7 @@ def _determine_placement_strategy( if ( self.moe_parallel_config.use_all2all_kernels - and not self.moe_parallel_config.use_deepep_ll_kernels - and not self.moe_parallel_config.use_nixl_ep_kernels + and not self.moe_parallel_config.needs_round_robin_routing_tables ): logger.warning( "Round-robin placement requires DeepEP-ll or NIXL backend. " From 796a38416028e67d9e266e593a0f737f4ff84656 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 29 Apr 2026 02:22:26 +0000 Subject: [PATCH 126/191] remove debug print Signed-off-by: Bill Nell --- vllm/utils/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index f556610e46d7..e8287b0cd114 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -46,6 +46,4 @@ def _check_bases(cls): if _check_bases(b): return True - res = _check_bases(module.__class__) - print(f"IS_MOE_LAYER[{module.__class__}] = {res}") - return res + return _check_bases(module.__class__) From c74f2856175c6bc99e645dfa711ac34f243b558d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 29 Apr 2026 19:53:58 +0000 Subject: [PATCH 127/191] try to fix doc Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index ba964e1ab7b7..0e204f4d1fb8 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -378,7 +378,7 @@ def update( dp_size: New DP size (if changed, for AITER buffer 
reinitialization) top_k: New top_k (if changed, for AITER buffer reinitialization) max_num_batched_tokens: New max batched tokens (if changed, for AITER - buffer reinitialization) + buffer reinitialization) """ if new_ep_size is not None: self.moe_parallel_config.ep_size = new_ep_size From 4791779ac8d0208014b8176b7b14f9149864bf79 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 30 Apr 2026 17:30:31 +0000 Subject: [PATCH 128/191] some fixes Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/oracle/fp8.py | 3 ++- vllm/model_executor/layers/fused_moe/oracle/nvfp4.py | 3 ++- .../model_executor/layers/fused_moe/runner/moe_runner.py | 9 ++------- .../layers/quantization/utils/marlin_utils.py | 2 +- .../layers/quantization/utils/marlin_utils_fp4.py | 9 +++++---- vllm/model_executor/models/deepseek_v4.py | 7 +++++-- vllm/model_executor/warmup/deep_gemm_warmup.py | 2 +- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index cb687b10536a..7ff33825a2fe 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -18,6 +18,7 @@ fp8_w8a8_moe_quant_config, fp8_w8a16_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( FlashinferMoeBackend, get_flashinfer_moe_backend, @@ -407,7 +408,7 @@ def _return_or_raise( def convert_to_fp8_moe_kernel_format( fp8_backend: Fp8MoeBackend, - layer: torch.nn.Module, + layer: RoutedExperts, w13: torch.Tensor, w2: torch.Tensor, w13_scale: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index 39d404a3d4a5..e97c0571dcec 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ 
-17,6 +17,7 @@ nvfp4_moe_quant_config, nvfp4_w4a16_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( prepare_nvfp4_moe_layer_for_fi_or_cutlass, prepare_nvfp4_moe_layer_for_flashinfer_cutedsl, @@ -285,7 +286,7 @@ def _return_or_raise( def convert_to_nvfp4_moe_kernel_format( nvfp4_backend: NvFp4MoeBackend, - layer: torch.nn.Module, + layer: RoutedExperts, w13: torch.Tensor, w13_scale: torch.Tensor, w13_scale_2: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index b9d8452f5ef1..6f108d6fd33f 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -386,7 +386,7 @@ def _maybe_reduce_final_output( and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) and not self._fused_output_is_reduced ): - states = tensor_model_parallel_all_reduce(states) + states = tensor_model_parallel_all_reduce(states.contiguous()) return states @@ -575,12 +575,11 @@ def forward( # `moe_config.hidden_dim`, e.g. after `align_trtllm_fp4_moe_hidden_dim_for_fi` # so routed output can be trimmed before # shared+routed add / latent up proj if needed. - routed_hidden_dim = hidden_states.shape[-1] + hidden_states, og_hidden_dim = self._maybe_pad_hidden_states( shared_experts_input, hidden_states, ) - hidden_dim_was_padded = hidden_states.shape[-1] > routed_hidden_dim result = self._forward_entry( hidden_states, @@ -601,10 +600,6 @@ def forward( # Extract outputs from result shared_output, fused_output = _unpack(result) - if ( - shared_output is not None or self.routed_output_transform is not None - ) and hidden_dim_was_padded: - fused_output = fused_output[..., :routed_hidden_dim] # Remember 40794. 
Double check tests/lora/test_gpt_oss.py::test_gpt_oss_tp2 fused_output = fused_output[:, :og_hidden_dim] diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 39e1083a81dd..bd6b86f7ab52 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -8,7 +8,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe import RoutedExperts +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.int8_utils import ( diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index c02d39c17a02..f1f2e3b27e23 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -6,6 +6,7 @@ import vllm._custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.quantization.utils.marlin_utils import ( USE_FP32_REDUCE_DEFAULT, get_marlin_input_dtype, @@ -286,7 +287,7 @@ def prepare_fp4_layer_for_marlin( def prepare_nvfp4_moe_layer_for_marlin( - layer: torch.nn.Module, + layer: RoutedExperts, w13: torch.Tensor, w13_scale: torch.Tensor, w13_scale_2: torch.Tensor, @@ -353,7 +354,7 @@ def repack_weight(weight: torch.Tensor, name: str) -> torch.Tensor: # WEIGHT SCALES # Permute scales - def premute_scales( + def permute_scales( scales: torch.Tensor, g_scales: torch.Tensor, name: str ) -> tuple[torch.Tensor, torch.Tensor]: scales = 
scales.to(param_dtype) @@ -388,8 +389,8 @@ def premute_scales( g_scales = g_scales / combined_scale_factor return scales, g_scales - w13_scale, w13_scale_2 = premute_scales(w13_scale, w13_scale_2, "w13") - w2_scale, w2_scale_2 = premute_scales(w2_scale, w2_scale_2, "w2") + w13_scale, w13_scale_2 = permute_scales(w13_scale, w13_scale_2, "w13") + w2_scale, w2_scale_2 = permute_scales(w2_scale, w2_scale_2, "w2") return w13, w13_scale, w13_scale_2, w2, w2_scale, w2_scale_2 diff --git a/vllm/model_executor/models/deepseek_v4.py b/vllm/model_executor/models/deepseek_v4.py index d6edf0789f57..722c15fc5166 100644 --- a/vllm/model_executor/models/deepseek_v4.py +++ b/vllm/model_executor/models/deepseek_v4.py @@ -23,8 +23,11 @@ DeepseekV4MLAModules, DeepseekV4MultiHeadLatentAttentionWrapper, ) -from vllm.model_executor.layers.fused_moe import FusedMoE, GateLinear -from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + GateLinear, + UnquantizedFusedMoEMethod, +) from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import ( fused_topk_bias, ) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 7d9af774e810..3a337907d5aa 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -170,7 +170,7 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: ): return False - moe_kernel = getattr(module.quant_method, "moe_kernel", None) + moe_kernel = getattr(quant_method, "moe_kernel", None) if moe_kernel is None: return False From c8a42c29611b1611e23eae30b39675f15a1d0892 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 1 May 2026 17:35:08 +0000 Subject: [PATCH 129/191] fixes Signed-off-by: Bill Nell --- .../layers/fused_moe/routed_experts.py | 3 + .../layers/fused_moe/runner/moe_runner.py | 5 + .../model_loader/weight_utils.py | 
107 +++++++++++++++++- vllm/model_executor/models/gpt_oss.py | 5 +- 4 files changed, 116 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 58a50321e4bd..03de14aed6f9 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -934,6 +934,7 @@ def forward( router_logits: torch.Tensor | None = None, shared_experts: torch.nn.Module | None = None, # SharedExperts shared_experts_input: torch.Tensor | None = None, + input_ids: torch.Tensor | None = None, ) -> torch.Tensor: """ Execute routed experts using the quantization method's apply function. @@ -963,6 +964,7 @@ def forward( layer=self, # Pass RoutedExperts as layer x=x, router_logits=router_logits, + input_ids=input_ids, ) else: # Modular kernels use pre-computed routing @@ -973,6 +975,7 @@ def forward( topk_ids=topk_ids, shared_experts=shared_experts, shared_experts_input=shared_experts_input, + input_ids=input_ids, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 6f108d6fd33f..2b17e79aeae1 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -809,6 +809,11 @@ def activation(self) -> MoEActivation: # Expert maps # + @property + def expert_map_manager(self): + """Forward to routed_experts.expert_map_manager for backward compatibility.""" + return self.routed_experts.expert_map_manager + @property def expert_placement_strategy(self) -> ExpertPlacementStrategy: return self.expert_map_manager.placement_strategy diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 31b00df4e4c3..ce6e50f2be3b 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ 
b/vllm/model_executor/model_loader/weight_utils.py @@ -13,7 +13,7 @@ import threading import time from collections import defaultdict -from collections.abc import Callable, Generator +from collections.abc import Callable, Generator, Iterable from contextlib import contextmanager from pathlib import Path from typing import IO, Any @@ -1605,3 +1605,108 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None: # If there were no matches, return the untouched param name return name + + +def maybe_remap_moe_expert_param_name( + name: str, + params_dict: dict[str, torch.nn.Parameter], +) -> str: + """ + Remap MoE expert parameter names to account for routed_experts hierarchy. + + This handles the transition from the old FusedMoE structure where weights + were directly in the experts module, to the new MoERunner → RoutedExperts + structure. + + Checkpoint weights have names like: + layers.0.mlp.experts.w13_weight + But actual parameters are now: + layers.0.mlp.experts.routed_experts.w13_weight + + This function inserts 'routed_experts.' into the path when needed. + + Args: + name: Parameter name from checkpoint + params_dict: Dictionary of model parameters (from named_parameters()) + + Returns: + Remapped parameter name if routed_experts hierarchy exists, + otherwise the original name + """ + # Only remap if this looks like an expert parameter + if ".mlp.experts." not in name: + return name + + # Skip if already has routed_experts + if ".mlp.experts.routed_experts." 
in name: + return name + + # Expert parameter patterns to check + expert_param_suffixes = [ + "w13_weight", + "w2_weight", + "w13_weight_scale", + "w2_weight_scale", + "w13_input_scale", + "w2_input_scale", + "w13_bias", + "w2_bias", + "w13_scale", + "w2_scale", + "w13_g_idx", + "w2_g_idx", + "w13_qweight", + "w2_qweight", + "w13_qzeros", + "w2_qzeros", + "w13_weight_shape", + "w2_weight_shape", + ] + + # Check if this is an expert weight parameter + is_expert_param = any( + f".{suffix}" in name or name.endswith(suffix) + for suffix in expert_param_suffixes + ) + + if not is_expert_param: + return name + + # Try inserting routed_experts + new_name = name.replace(".mlp.experts.", ".mlp.experts.routed_experts.") + + # Only use the new name if it exists in the model + if new_name in params_dict: + return new_name + + # Otherwise return original name (old checkpoint format or different structure) + return name + + +def remap_moe_expert_weights( + weights: Iterable[tuple[str, torch.Tensor]], + params_dict: dict[str, torch.nn.Parameter], +) -> Generator[tuple[str, torch.Tensor], None, None]: + """ + Wrapper generator that remaps MoE expert parameter names for backward compatibility. + + This allows models with custom weight loading to automatically handle both old + and new checkpoint formats without needing model-specific remapping code. + + Usage: + params_dict = dict(model.named_parameters()) + for name, weight in remap_moe_expert_weights(weights, params_dict): + # name is automatically remapped if needed + param = params_dict[name] + ... 
+ + Args: + weights: Iterator of (name, tensor) tuples from checkpoint + params_dict: Dictionary of model parameters (from named_parameters()) + + Yields: + (remapped_name, tensor) tuples + """ + for name, weight in weights: + remapped_name = maybe_remap_moe_expert_param_name(name, params_dict) + yield (remapped_name, weight) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 847b05173c68..522e2d1d739d 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -580,9 +580,8 @@ def _get_moe_weight_dtype(layer_id: int = 0) -> str | None: Returns: Weight dtype string (e.g., "mxfp4", "fp8") or None if not available """ - # XXXXXXXXXXXXXXXXXXX - if hasattr(self.layers[layer_id].mlp.experts.quant_method, "weight_dtype"): - return self.layers[layer_id].mlp.experts.quant_method.weight_dtype + if hasattr(self.layers[layer_id].mlp.experts._quant_method, "weight_dtype"): + return self.layers[layer_id].mlp.experts._quant_method.weight_dtype return None intermediate_size = self.config.intermediate_size From 522a8fcbec5eb18dc621dab988b5255d165428c7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 1 May 2026 20:47:28 +0000 Subject: [PATCH 130/191] weight loading fixes Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/routed_experts.py | 1 - vllm/model_executor/models/gemma4.py | 10 +++++++--- vllm/model_executor/models/gpt_oss.py | 6 ++++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 03de14aed6f9..ce8234862bb9 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -975,7 +975,6 @@ def forward( topk_ids=topk_ids, shared_experts=shared_experts, shared_experts_input=shared_experts_input, - input_ids=input_ids, ) diff --git a/vllm/model_executor/models/gemma4.py 
b/vllm/model_executor/models/gemma4.py index bb91fd601e70..5bac11937b16 100644 --- a/vllm/model_executor/models/gemma4.py +++ b/vllm/model_executor/models/gemma4.py @@ -1387,10 +1387,10 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: expert_params_mapping = [ # (param_name, weight_name, expert_id, shard_id) ( - "experts.w13_" + "moe.experts.routed_experts.w13_" if proj_name in ["gate_proj", "up_proj"] - else "experts.w2_", - f"experts.{expert_id}.{proj_name}.", + else "moe.experts.routed_experts.w2_", + f"moe.experts.{expert_id}.{proj_name}.", expert_id, shard_id, ) @@ -1493,6 +1493,10 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: continue if is_pp_missing_parameter(name, self): continue + # Skip if name doesn't exist in params_dict (e.g., individual + # expert weights that should have been handled above) + if name not in params_dict: + continue param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 522e2d1d739d..0c726ae6325a 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -43,6 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name, + remap_moe_expert_weights, ) from vllm.model_executor.models.utils import sequence_parallel_chunk from vllm.platforms import current_platform @@ -379,7 +380,8 @@ def _load_weights_mxfp4( tp_rank_start = tp_rank * per_rank_intermediate_size tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, intermediate_size) - for name, weight in weights: + # Use centralized weight remapping for MoE expert parameters (Solution 7) + for name, weight in remap_moe_expert_weights(weights, params_dict): # Skip layers on other devices. 
if is_pp_missing_parameter(name, self): continue @@ -682,7 +684,7 @@ def kv_cache_scale_loader( continue if ( - all(key in name for key in ["input_scale", "mlp.experts"]) # XXXXX + all(key in name for key in ["input_scale", "mlp.experts"]) and expert_id is not None ): assert loaded_weight.numel() == 1 From 1b3841751a3517e0b014674da99cf585b595c5dc Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 1 May 2026 22:41:42 +0000 Subject: [PATCH 131/191] more weight loading fixes Signed-off-by: Bill Nell --- vllm/model_executor/models/ernie45_vl_moe.py | 6 +++++- vllm/model_executor/models/granitemoe.py | 6 +++--- vllm/model_executor/models/granitemoehybrid.py | 8 +++++--- vllm/model_executor/models/granitemoeshared.py | 6 +++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 93c9316974b7..2cdaca661051 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -660,6 +660,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: num_experts=max(self.config.moe_num_experts), ) + # print(f"EPM {expert_params_mapping}") + params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: @@ -693,7 +695,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: break else: # Distinguish between vision experts and text experts - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: moe_offset = int(name.split(".")[-3]) vision_expert_start_idx = self.config.moe_num_experts[0] is_text_expert = moe_offset <= vision_expert_start_idx - 1 @@ -708,6 +710,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping + # print(f"MATCH {weight_name, name}") + if weight_name not in name: 
continue diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index e3585a6dd746..0f2db302e246 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -451,11 +451,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for e in range(p.size(0)): w1_name = n.replace( ".block_sparse_moe.input_linear.weight", - f".block_sparse_moe.experts.{e}.w1.weight", + f".block_sparse_moe.experts.routed_experts.{e}.w1.weight", ) w3_name = n.replace( ".block_sparse_moe.input_linear.weight", - f".block_sparse_moe.experts.{e}.w3.weight", + f".block_sparse_moe.experts.routed_experts.{e}.w3.weight", ) w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights @@ -466,7 +466,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for e in range(p.size(0)): w2_name = n.replace( ".block_sparse_moe.output_linear.weight", - f".block_sparse_moe.experts.{e}.w2.weight", + f".block_sparse_moe.experts.routed_experts.{e}.w2.weight", ) w2_param = p[e] assert w2_name not in new_weights diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 1ab069e3ba38..81298b8b2455 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -455,6 +455,8 @@ def _load_shard(n, p, shard_id): loaded_params.add(n) def _load_expert(n, p, name, shard_id, expert_id): + if n not in params_dict: + return param = params_dict[n] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, p, name, shard_id=shard_id, expert_id=expert_id) @@ -530,14 +532,14 @@ def _load_quant_expert(name, loaded_weight): ) w1_param, w3_param = p[e].chunk(2, dim=0) _load_expert( - n.replace(".input_linear.", ".experts.w13_"), + n.replace(".input_linear.", ".experts.routed_experts.w13_"), w1_param, w1_name, shard_id="w1", 
expert_id=e, ) _load_expert( - n.replace(".input_linear.", ".experts.w13_"), + n.replace(".input_linear.", ".experts.routed_experts.w13_"), w3_param, w3_name, shard_id="w3", @@ -553,7 +555,7 @@ def _load_quant_expert(name, loaded_weight): ) w2_param = p[e] _load_expert( - n.replace(".output_linear.", ".experts.w2_"), + n.replace(".output_linear.", ".experts.routed_experts.w2_"), w2_param, w2_name, shard_id="w2", diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 7abc682c58e5..7c8a92b88dda 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -214,11 +214,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for e in range(p.size(0)): w1_name = n.replace( ".block_sparse_moe.input_linear.weight", - f".block_sparse_moe.experts.{e}.w1.weight", + f".block_sparse_moe.experts.routed_experts.{e}.w1.weight", ) w3_name = n.replace( ".block_sparse_moe.input_linear.weight", - f".block_sparse_moe.experts.{e}.w3.weight", + f".block_sparse_moe.experts.routed_experts.{e}.w3.weight", ) w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights @@ -229,7 +229,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for e in range(p.size(0)): w2_name = n.replace( ".block_sparse_moe.output_linear.weight", - f".block_sparse_moe.experts.{e}.w2.weight", + f".block_sparse_moe.experts.routed_experts.{e}.w2.weight", ) w2_param = p[e] assert w2_name not in new_weights From 3adc884cc1f2dd18c1e0f8ec0ed3d8674a9a2438 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 4 May 2026 19:01:10 +0000 Subject: [PATCH 132/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/models/laguna.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/laguna.py b/vllm/model_executor/models/laguna.py index 08f35d691817..0efbdcef5995 100644 --- 
a/vllm/model_executor/models/laguna.py +++ b/vllm/model_executor/models/laguna.py @@ -20,7 +20,10 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -650,7 +653,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: Returns mapping tuples of (param_name, weight_name, expert_id, shard_id) that handle both weights and quantization scales. """ - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", From 13d121ebc7aea9759336833699b1c68885393f39 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 4 May 2026 19:38:42 +0000 Subject: [PATCH 133/191] loader fixes Signed-off-by: Bill Nell --- vllm/model_executor/models/ernie45_vl_moe.py | 11 +++++------ vllm/model_executor/models/granitemoe.py | 6 +++--- vllm/model_executor/models/laguna.py | 7 +++++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 2cdaca661051..550adf31607b 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -660,8 +660,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: num_experts=max(self.config.moe_num_experts), ) - # print(f"EPM {expert_params_mapping}") - params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: @@ -700,18 +698,19 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: vision_expert_start_idx = self.config.moe_num_experts[0] is_text_expert 
= moe_offset <= vision_expert_start_idx - 1 if is_text_expert: - name = name.replace(".experts.", ".text_experts.") + name = name.replace( + ".experts.", ".text_experts.routed_experts." + ) else: + delta = moe_offset - vision_expert_start_idx name = name.replace( f".experts.{moe_offset}", - f".vision_experts.{moe_offset - vision_expert_start_idx}", + f".vision_experts.routed_experts.{delta}", ) for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping - # print(f"MATCH {weight_name, name}") - if weight_name not in name: continue diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 0f2db302e246..e3585a6dd746 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -451,11 +451,11 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for e in range(p.size(0)): w1_name = n.replace( ".block_sparse_moe.input_linear.weight", - f".block_sparse_moe.experts.routed_experts.{e}.w1.weight", + f".block_sparse_moe.experts.{e}.w1.weight", ) w3_name = n.replace( ".block_sparse_moe.input_linear.weight", - f".block_sparse_moe.experts.routed_experts.{e}.w3.weight", + f".block_sparse_moe.experts.{e}.w3.weight", ) w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights @@ -466,7 +466,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for e in range(p.size(0)): w2_name = n.replace( ".block_sparse_moe.output_linear.weight", - f".block_sparse_moe.experts.routed_experts.{e}.w2.weight", + f".block_sparse_moe.experts.{e}.w2.weight", ) w2_param = p[e] assert w2_name not in new_weights diff --git a/vllm/model_executor/models/laguna.py b/vllm/model_executor/models/laguna.py index 08f35d691817..0efbdcef5995 100644 --- a/vllm/model_executor/models/laguna.py +++ b/vllm/model_executor/models/laguna.py @@ -20,7 +20,10 @@ ) from vllm.logger import init_logger from 
vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + fused_moe_make_expert_params_mapping, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, @@ -650,7 +653,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: Returns mapping tuples of (param_name, weight_name, expert_id, shard_id) that handle both weights and quantization scales. """ - return FusedMoE.make_expert_params_mapping( + return fused_moe_make_expert_params_mapping( self, ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", From fa6975958f36539fea978134d33782da36c20195 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 4 May 2026 20:48:44 +0000 Subject: [PATCH 134/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/models/aria.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index adc2000a89fe..16b9b5eac336 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -216,9 +216,9 @@ def forward( return out -# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx class AriaFusedMoE(torch.nn.Module): def __init__(self, *args, **kwargs): + super().__init__() self.moe = FusedMoE(*args, **kwargs) def forward( @@ -226,6 +226,7 @@ def forward( ) -> torch.Tensor: return self.moe(hidden_states, router_logits) + # TODO(bnell): this probably requires a subclass of RoutedExperts def weight_loader( self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_id: str ) -> None: @@ -234,13 +235,14 @@ def weight_loader( # up weights for each expert. 
# Note: Loading expert weights with quantization is not supported tp_rank = get_tensor_model_parallel_rank() + tp_size = self.moe.moe_config.tp_size if shard_id == "w13": # the shape of loaded_weight is # (num_experts, hidden_size, 2 * moe_intermediate_size) - if self.tp_size > 1: + if tp_size > 1: up, gate = loaded_weight.chunk(2, dim=-1) - up_current_rank = up.chunk(self.tp_size, dim=-1)[tp_rank] - gate_current_rank = gate.chunk(self.tp_size, dim=-1)[tp_rank] + up_current_rank = up.chunk(tp_size, dim=-1)[tp_rank] + gate_current_rank = gate.chunk(tp_size, dim=-1)[tp_rank] up_and_gate = torch.cat( [up_current_rank, gate_current_rank], dim=-1 ).transpose(1, 2) @@ -250,8 +252,8 @@ def weight_loader( elif shard_id == "w2": # the shape of loaded_weight is # (num_experts, moe_intermediate_size, hidden_size) - if self.tp_size > 1: - down_current_rank = loaded_weight.chunk(self.tp_size, dim=1)[tp_rank] + if tp_size > 1: + down_current_rank = loaded_weight.chunk(tp_size, dim=1)[tp_rank] param.data.copy_(down_current_rank.transpose(1, 2)) else: param.data.copy_(loaded_weight.transpose(1, 2)) @@ -360,8 +362,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: (".qkv_proj", ".v_proj", "v"), (".gate_up_proj", ".gate_proj", 0), (".gate_up_proj", ".up_proj", 1), - ("experts.w13_weight", "experts.fc1.weight", "w13"), - ("experts.w2_weight", "experts.fc2.weight", "w2"), + ("experts.moe.routed_experts.w13_weight", "experts.fc1.weight", "w13"), + ("experts.moe.routed_experts.w2_weight", "experts.fc2.weight", "w2"), ] params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() From 456668ec5fea9277d642095ab47f59398973b6d9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 4 May 2026 21:35:08 +0000 Subject: [PATCH 135/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/oracle/mxfp4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py 
b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py index 2354de658ae3..3c11872e600e 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -1311,7 +1311,7 @@ def make_mxfp4_moe_kernel( experts_cls: type[mk.FusedMoEExperts], mxfp4_backend: Mxfp4MoeBackend, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - layer: "RoutedExperts" | None = None, + layer: "RoutedExperts | None" = None, ) -> mk.FusedMoEKernel: """Create a FusedMoEKernel for the given MXFP4 backend.""" is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic) From 69c62e8c1e45c8fdd2981785d6109746eff841a0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 4 May 2026 21:36:59 +0000 Subject: [PATCH 136/191] fix merge Signed-off-by: Bill Nell --- .../layers/quantization/utils/humming_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/humming_utils.py b/vllm/model_executor/layers/quantization/utils/humming_utils.py index f8c10bdcae16..2cfc0fc87305 100644 --- a/vllm/model_executor/layers/quantization/utils/humming_utils.py +++ b/vllm/model_executor/layers/quantization/utils/humming_utils.py @@ -12,7 +12,7 @@ FusedMoEQuantConfig, FusedMoEQuantDesc, ) -from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape @@ -82,7 +82,7 @@ def prepare_humming_layer(layer: LinearBase, quant_config: dict): HummingMethod.transform_humming_layer(layer) -def prepare_humming_moe_layer(layer: FusedMoE, quant_config: dict): +def prepare_humming_moe_layer(layer: RoutedExperts, quant_config: dict): weight_schema = BaseWeightSchema.from_config(quant_config) input_quant_config = envs.VLLM_HUMMING_INPUT_QUANT_CONFIG or {} if 
humming_is_layer_skipped(input_quant_config, layer.layer_name): @@ -164,7 +164,7 @@ def prepare_humming_moe_layer(layer: FusedMoE, quant_config: dict): layer.register_buffer("locks", locks) -def get_humming_moe_quant_config(layer: FusedMoE): +def get_humming_moe_quant_config(layer: RoutedExperts): input_schema = layer.input_schemas["w13"] weight_schema = layer.weight_schemas["w13"] From fa64b769418b92559b16ebebb540e7141ee538f5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 4 May 2026 21:51:46 +0000 Subject: [PATCH 137/191] fix aria Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 5 ++++- vllm/model_executor/models/aria.py | 22 ++++++------------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 431343d4938f..de05cab28a0d 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -150,6 +150,7 @@ def FusedMoE( apply_routed_scale_to_output: bool = False, zero_expert_type: str | None = None, hash_indices_table: torch.Tensor | None = None, + routed_experts_cls: type[RoutedExperts] | None = None, ) -> MoERunner: # TODO update comment """FusedMoE layer builder for MoE models. 
@@ -305,7 +306,9 @@ def FusedMoE( # Create RoutedExperts instance BEFORE create_weights() # This will hold all expert weight parameters - routed_experts = RoutedExperts( + if routed_experts_cls is None: + routed_experts_cls = RoutedExperts + routed_experts = routed_experts_cls( layer_name, params_dtype, moe_config, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 16b9b5eac336..c4286f3b2c77 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import ( FusedMoE, + RoutedExperts, ) from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -216,17 +217,7 @@ def forward( return out -class AriaFusedMoE(torch.nn.Module): - def __init__(self, *args, **kwargs): - super().__init__() - self.moe = FusedMoE(*args, **kwargs) - - def forward( - self, hidden_states: torch.Tensor, router_logits: torch.Tensor - ) -> torch.Tensor: - return self.moe(hidden_states, router_logits) - - # TODO(bnell): this probably requires a subclass of RoutedExperts +class AriaRoutedExperts(RoutedExperts): def weight_loader( self, param: nn.Parameter, loaded_weight: torch.Tensor, shard_id: str ) -> None: @@ -235,7 +226,7 @@ def weight_loader( # up weights for each expert. 
# Note: Loading expert weights with quantization is not supported tp_rank = get_tensor_model_parallel_rank() - tp_size = self.moe.moe_config.tp_size + tp_size = self.moe_config.tp_size if shard_id == "w13": # the shape of loaded_weight is # (num_experts, hidden_size, 2 * moe_intermediate_size) @@ -289,7 +280,7 @@ def __init__( bias=config.mlp_bias, ) - self.experts = AriaFusedMoE( + self.experts = FusedMoE( shared_experts=self.shared_experts, num_experts=config.moe_num_experts, top_k=config.moe_topk, @@ -297,6 +288,7 @@ def __init__( intermediate_size=config.intermediate_size, quant_config=quant_config, prefix=f"{prefix}.experts", + routed_experts_cls=AriaRoutedExperts, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -362,8 +354,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: (".qkv_proj", ".v_proj", "v"), (".gate_up_proj", ".gate_proj", 0), (".gate_up_proj", ".up_proj", 1), - ("experts.moe.routed_experts.w13_weight", "experts.fc1.weight", "w13"), - ("experts.moe.routed_experts.w2_weight", "experts.fc2.weight", "w2"), + ("experts.routed_experts.w13_weight", "experts.fc1.weight", "w13"), + ("experts.routed_experts.w2_weight", "experts.fc2.weight", "w2"), ] params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() From 5f996a40b2879b2304c92392b5521e46e240878e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 4 May 2026 23:25:49 +0000 Subject: [PATCH 138/191] fix lint Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/oracle/mxfp4.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py index 3c11872e600e..4f3c600abda4 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to 
the vLLM project from enum import Enum -from typing import TYPE_CHECKING, Union +from typing import Union import torch @@ -11,6 +11,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( FusedMoEConfig, + RoutedExperts, ) from vllm.model_executor.layers.fused_moe.all2all_utils import ( maybe_make_prepare_finalize, @@ -33,10 +34,6 @@ from vllm.utils.import_utils import has_triton_kernels from vllm.utils.math_utils import round_up -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe import RoutedExperts - - logger = init_logger(__name__) if has_triton_kernels(): @@ -1285,12 +1282,11 @@ def make_mxfp4_moe_quant_config( gemm1_clamp_limit=swiglu_limit, ) elif mxfp4_backend == Mxfp4MoeBackend.HUMMING: - from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.quantization.utils.humming_utils import ( get_humming_moe_quant_config, ) - assert isinstance(layer, FusedMoE) + assert isinstance(layer, RoutedExperts) return get_humming_moe_quant_config(layer) else: return ocp_mx_moe_quant_config( @@ -1311,7 +1307,7 @@ def make_mxfp4_moe_kernel( experts_cls: type[mk.FusedMoEExperts], mxfp4_backend: Mxfp4MoeBackend, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, - layer: "RoutedExperts | None" = None, + layer: RoutedExperts | None = None, ) -> mk.FusedMoEKernel: """Create a FusedMoEKernel for the given MXFP4 backend.""" is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic) From 09a18e216bcb5d4f26c94b0c6829ea4baaf2a07d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 5 May 2026 00:24:38 +0000 Subject: [PATCH 139/191] remove debugging code Signed-off-by: Bill Nell --- vllm/utils/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7040fb52b052..bf455c261f4f 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -46,6 +46,4 @@ def _check_bases(cls): if 
_check_bases(b): return True - res = _check_bases(module.__class__) - print(f"IS_MOE_LAYER[{module.__class__}] = {res}") - return res + return _check_bases(module.__class__) From eeeff4aac3e95e0d7a07b17218ab8e4ad24526e2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 5 May 2026 03:14:43 +0000 Subject: [PATCH 140/191] fixes Signed-off-by: Bill Nell --- .../layers/fused_moe/eplb_manager.py | 69 +------------------ .../layers/fused_moe/routed_experts.py | 34 +++++---- vllm/model_executor/models/ernie45_vl_moe.py | 7 +- 3 files changed, 26 insertions(+), 84 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index 162a7fb35e87..48220e8e5cae 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -13,7 +13,7 @@ import torch -from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState +from vllm.distributed.eplb.eplb_state import EplbLayerState class EplbManager: @@ -167,70 +167,3 @@ def _maybe_make_contiguous( and weight.shape != torch.Size([]) and not name.startswith("shared_experts._layer") ] - - @staticmethod - def make_expert_params_mapping( - model: torch.nn.Module, - ckpt_gate_proj_name: str, - ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int, - num_redundant_experts: int = 0, - ) -> list[tuple[str, str, int, str]]: - """ - Create expert parameter mapping for weight loading with redundant experts. - - This mapping handles the physical-to-logical expert ID conversion needed - when loading weights with EPLB redundant experts. 
- - Args: - model: The model containing the MoE layer - ckpt_gate_proj_name: Name of gate projection in checkpoint - ckpt_down_proj_name: Name of down projection in checkpoint - ckpt_up_proj_name: Name of up projection in checkpoint - num_experts: Number of logical (non-redundant) experts - num_redundant_experts: Number of redundant experts - - Returns: - List of tuples (param_name, weight_name, expert_id, shard_id) - where: - - param_name: Parameter name in the layer - - weight_name: Weight name in checkpoint - - expert_id: Physical expert ID - - shard_id: Shard identifier (w1, w2, w3) - """ - num_physical_experts = num_experts + num_redundant_experts - - # In the returned mapping: - # - `expert_id` is the physical expert id - # - `weight_name` contains the weight name of the logical expert - # So that we should map the expert id to logical in `weight_name` - physical_to_logical_map = ( - EplbState.build_initial_global_physical_to_logical_map( - num_experts, num_redundant_experts - ) - ) - - base_layer = ( - "base_layer." - if any(".base_layer." 
in name for name, _ in model.named_parameters()) - else "" - ) - - return [ - # (param_name, weight_name, expert_id, shard_id) - ( - f"experts.{base_layer}w13_" - if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] - else f"experts.{base_layer}w2_", - f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", - expert_id, - shard_id, - ) - for expert_id in range(num_physical_experts) - for shard_id, weight_name in [ - ("w1", ckpt_gate_proj_name), - ("w2", ckpt_down_proj_name), - ("w3", ckpt_up_proj_name), - ] - ] diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index ce8234862bb9..d0d0f10f3d90 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -872,25 +872,31 @@ def make_expert_params_mapping( """ Create expert parameter mapping for weight loading with redundant experts. - In the returned mapping: - - `expert_id` is the physical expert id - - `weight_name` contains the weight name of the logical expert - So that we map the expert id to logical in `weight_name` + This mapping handles the physical-to-logical expert ID conversion needed + when loading weights with EPLB redundant experts. 
Args: model: The model containing the MoE layer - ckpt_gate_proj_name: Checkpoint parameter name for gate projection - ckpt_down_proj_name: Checkpoint parameter name for down projection - ckpt_up_proj_name: Checkpoint parameter name for up projection - num_experts: Number of logical experts - num_redundant_experts: Number of redundant experts for EPLB + ckpt_gate_proj_name: Name of gate projection in checkpoint + ckpt_down_proj_name: Name of down projection in checkpoint + ckpt_up_proj_name: Name of up projection in checkpoint + num_experts: Number of logical (non-redundant) experts + num_redundant_experts: Number of redundant experts Returns: - List of (param_name, weight_name, expert_id, shard_id) tuples + List of tuples (param_name, weight_name, expert_id, shard_id) + where: + - param_name: Parameter name in the layer + - weight_name: Weight name in checkpoint + - expert_id: Physical expert ID + - shard_id: Shard identifier (w1, w2, w3) """ num_physical_experts = num_experts + num_redundant_experts - # Build initial physical-to-logical mapping + # In the returned mapping: + # - `expert_id` is the physical expert id + # - `weight_name` contains the weight name of the logical expert + # So that we should map the expert id to logical in `weight_name` physical_to_logical_map = ( EplbState.build_initial_global_physical_to_logical_map( num_experts, num_redundant_experts @@ -906,10 +912,10 @@ def make_expert_params_mapping( return [ # (param_name, weight_name, expert_id, shard_id) ( - f".experts.routed_experts.{base_layer}w13_" + f"experts.routed_experts.{base_layer}w13_" if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] - else f".experts.routed_experts.{base_layer}w2_", - f".experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", + else f"experts.routed_experts.{base_layer}w2_", + f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", expert_id, shard_id, ) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py 
b/vllm/model_executor/models/ernie45_vl_moe.py index 550adf31607b..66b135cc30a6 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -697,15 +697,18 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: moe_offset = int(name.split(".")[-3]) vision_expert_start_idx = self.config.moe_num_experts[0] is_text_expert = moe_offset <= vision_expert_start_idx - 1 + routed_experts = ( + ".routed_experts" if ("w13_" in name or "w2_" in name) else "" + ) if is_text_expert: name = name.replace( - ".experts.", ".text_experts.routed_experts." + ".experts", f".text_experts{routed_experts}" ) else: delta = moe_offset - vision_expert_start_idx name = name.replace( f".experts.{moe_offset}", - f".vision_experts.routed_experts.{delta}", + f".vision_experts.{routed_experts}{delta}", ) for mapping in expert_params_mapping: From 960a8424e43caaac2922d1ceee2ca13d4583390f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 5 May 2026 03:18:59 +0000 Subject: [PATCH 141/191] fix merge Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/config.py | 6 ++++-- vllm/model_executor/layers/fused_moe/routed_experts.py | 5 ----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 159c3c41429b..8d8a98f8e606 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -1307,9 +1307,11 @@ def __post_init__(self): "Turn it off by setting VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0" ) - if not self.is_act_and_mul and not current_platform.is_cuda_alike(): + if not self.is_act_and_mul and not ( + current_platform.is_cuda_alike() or current_platform.is_xpu() + ): raise NotImplementedError( - "is_act_and_mul=False is supported only for CUDA and ROCm for now" + "is_act_and_mul=False is supported only for CUDA, XPU and ROCm for now" ) @property 
diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index d0d0f10f3d90..b114c6869291 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -527,11 +527,6 @@ def weight_loader( return_success: bool = False, ) -> bool | None: quant_config_name = self.quant_config and self.quant_config.get_name() - - if quant_config_name == "humming": - assert hasattr(self.quant_method, "weight_schema") - quant_config_name = self.quant_method.weight_schema.quant_method - if quant_config_name == "gpt_oss_mxfp4": # (FIXME) for gpt-oss all experts are combined if "bias" in weight_name: From 90c74a86f07c19ed2e78f48447dcc83e0c1e7080 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 5 May 2026 15:12:28 +0000 Subject: [PATCH 142/191] move mapping fn back to FusedMoE Signed-off-by: Bill Nell --- .../layers/fused_moe/eplb_manager.py | 69 +------------------ vllm/model_executor/layers/fused_moe/layer.py | 65 +++++++++-------- 2 files changed, 39 insertions(+), 95 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py index 68137ccb01bc..d444664602e6 100644 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ b/vllm/model_executor/layers/fused_moe/eplb_manager.py @@ -13,7 +13,7 @@ import torch -from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState +from vllm.distributed.eplb.eplb_state import EplbLayerState class EplbManager: @@ -177,70 +177,3 @@ def _maybe_make_contiguous( and not name.startswith("_routed_input_transform.") and not name.startswith("_routed_output_transform.") ] - - @staticmethod - def make_expert_params_mapping( - model: torch.nn.Module, - ckpt_gate_proj_name: str, - ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int, - num_redundant_experts: int = 0, - ) -> list[tuple[str, str, int, str]]: - """ - 
Create expert parameter mapping for weight loading with redundant experts. - - This mapping handles the physical-to-logical expert ID conversion needed - when loading weights with EPLB redundant experts. - - Args: - model: The model containing the MoE layer - ckpt_gate_proj_name: Name of gate projection in checkpoint - ckpt_down_proj_name: Name of down projection in checkpoint - ckpt_up_proj_name: Name of up projection in checkpoint - num_experts: Number of logical (non-redundant) experts - num_redundant_experts: Number of redundant experts - - Returns: - List of tuples (param_name, weight_name, expert_id, shard_id) - where: - - param_name: Parameter name in the layer - - weight_name: Weight name in checkpoint - - expert_id: Physical expert ID - - shard_id: Shard identifier (w1, w2, w3) - """ - num_physical_experts = num_experts + num_redundant_experts - - # In the returned mapping: - # - `expert_id` is the physical expert id - # - `weight_name` contains the weight name of the logical expert - # So that we should map the expert id to logical in `weight_name` - physical_to_logical_map = ( - EplbState.build_initial_global_physical_to_logical_map( - num_experts, num_redundant_experts - ) - ) - - base_layer = ( - "base_layer." - if any(".base_layer." 
in name for name, _ in model.named_parameters()) - else "" - ) - - return [ - # (param_name, weight_name, expert_id, shard_id) - ( - f"experts.{base_layer}w13_" - if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] - else f"experts.{base_layer}w2_", - f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", - expert_id, - shard_id, - ) - for expert_id in range(num_physical_experts) - for shard_id, weight_name in [ - ("w1", ckpt_gate_proj_name), - ("w2", ckpt_down_proj_name), - ("w3", ckpt_up_proj_name), - ] - ] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 069215bd4cc3..086d5a93be56 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -16,6 +16,7 @@ get_pcp_group, get_tensor_model_parallel_world_size, ) +from vllm.distributed.eplb.eplb_state import EplbState from vllm.logger import init_logger from vllm.model_executor.custom_op import PluggableLayer from vllm.model_executor.layers.fused_moe.activation import MoEActivation @@ -539,9 +540,11 @@ def _get_quant_method() -> FusedMoEMethodBase: # for heuristic purposes, so it must be initialized first. 
self.quant_method: FusedMoEMethodBase = _get_quant_method() - if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike(): + if not self.moe_config.is_act_and_mul and not ( + current_platform.is_cuda_alike() or current_platform.is_xpu() + ): raise NotImplementedError( - "is_act_and_mul=False is supported only for CUDA and ROCm for now" + "is_act_and_mul=False is supported only for CUDA and XPU for now" ) if enable_eplb and not self.quant_method.supports_eplb: @@ -1104,9 +1107,6 @@ def weight_loader( return_success: bool = False, ) -> bool | None: quant_config_name = self.quant_config and self.quant_config.get_name() - if quant_config_name == "humming": - assert hasattr(self.quant_method, "weight_schema") - quant_config_name = self.quant_method.weight_schema.quant_method if quant_config_name == "gpt_oss_mxfp4": # (FIXME) for gpt-oss all experts are combined if "bias" in weight_name: @@ -1519,31 +1519,42 @@ def make_expert_params_mapping( num_experts: int, num_redundant_experts: int = 0, ) -> list[tuple[str, str, int, str]]: - """ - Create expert parameter mapping for weight loading. - - Delegates to EplbManager for proper handling of redundant experts. 
- - Args: - model: The model containing the MoE layer - ckpt_gate_proj_name: Name of gate projection in checkpoint - ckpt_down_proj_name: Name of down projection in checkpoint - ckpt_up_proj_name: Name of up projection in checkpoint - num_experts: Number of logical (non-redundant) experts - num_redundant_experts: Number of redundant experts + num_physical_experts = num_experts + num_redundant_experts + + # In the returned mapping: + # - `expert_id` is the physical expert id + # - `weight_name` contains the weight name of the logical expert + # So that we should map the expert id to logical in `weight_name` + physical_to_logical_map = ( + EplbState.build_initial_global_physical_to_logical_map( + num_experts, num_redundant_experts + ) + ) - Returns: - List of tuples (param_name, weight_name, expert_id, shard_id) - """ - return EplbManager.make_expert_params_mapping( - model, - ckpt_gate_proj_name, - ckpt_down_proj_name, - ckpt_up_proj_name, - num_experts, - num_redundant_experts, + base_layer = ( + "base_layer." + if any(".base_layer." 
in name for name, _ in model.named_parameters()) + else "" ) + return [ + # (param_name, weight_name, expert_id, shard_id) + ( + f"experts.{base_layer}w13_" + if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] + else f"experts.{base_layer}w2_", + f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", + expert_id, + shard_id, + ) + for expert_id in range(num_physical_experts) + for shard_id, weight_name in [ + ("w1", ckpt_gate_proj_name), + ("w2", ckpt_down_proj_name), + ("w3", ckpt_up_proj_name), + ] + ] + @property def hidden_size(self) -> int: return self.moe_config.hidden_dim From eab0423a4130d9da017a61b08bb8d43e37d1382f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 5 May 2026 17:05:48 +0000 Subject: [PATCH 143/191] cleanups Signed-off-by: Bill Nell --- .../layers/fused_moe/routed_experts.py | 94 ++++++++++++------- .../layers/fused_moe/runner/moe_runner.py | 4 +- 2 files changed, 64 insertions(+), 34 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index b114c6869291..9f5ce6814a37 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -3,7 +3,7 @@ from collections.abc import Iterable from enum import Enum -from typing import Any, Literal, overload +from typing import TYPE_CHECKING, Any, Literal, overload import torch from torch.nn.parameter import UninitializedParameter @@ -14,6 +14,7 @@ ) from vllm.distributed.eplb.eplb_state import EplbState from vllm.logger import init_logger +from vllm.model_executor.custom_op import PluggableLayer from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, ) @@ -30,6 +31,10 @@ QuantizationConfig, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.runner.shared_experts import SharedExperts + + logger = init_logger(__name__) @@ -40,7 +45,8 @@ class FusedMoeWeightScaleSupported(Enum): BLOCK = "block" 
-class RoutedExperts(torch.nn.Module): +@PluggableLayer.register("routed_experts") +class RoutedExperts(PluggableLayer): """ Container for routed expert weights and execution logic. @@ -926,22 +932,18 @@ def make_expert_params_mapping( # Execution # - # TODO: split/overload this - def forward( + def forward_modular( self, x: torch.Tensor, - topk_weights: torch.Tensor | None = None, - topk_ids: torch.Tensor | None = None, - router_logits: torch.Tensor | None = None, - shared_experts: torch.nn.Module | None = None, # SharedExperts + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + shared_experts: "SharedExperts | None" = None, shared_experts_input: torch.Tensor | None = None, - input_ids: torch.Tensor | None = None, ) -> torch.Tensor: """ Execute routed experts using the quantization method's apply function. This is called by the runner after router selection (for modular kernels) - or with router logits (for monolithic kernels). It delegates to quant_method.apply() which accesses the weights on this RoutedExperts instance. 
@@ -949,34 +951,62 @@ def forward( x: Input tensor after any transforms topk_weights: Routing weights from router (for modular kernels) topk_ids: Selected expert IDs from router (for modular kernels) - router_logits: Router logits (for monolithic kernels) shared_experts: The shared experts (if any) shared_experts_input: Input for shared experts (if any) Returns: Output tensor from routed experts """ - quant_method = self.quant_method - - if quant_method.is_monolithic: - assert shared_experts is None - # Monolithic kernels handle routing internally - return quant_method.apply_monolithic( - layer=self, # Pass RoutedExperts as layer - x=x, - router_logits=router_logits, - input_ids=input_ids, - ) - else: - # Modular kernels use pre-computed routing - return quant_method.apply( - layer=self, # Pass RoutedExperts as layer - x=x, - topk_weights=topk_weights, - topk_ids=topk_ids, - shared_experts=shared_experts, - shared_experts_input=shared_experts_input, - ) + assert not self.quant_method.is_monolithic + + # Modular kernels use pre-computed routing + return self.quant_method.apply( + layer=self, + x=x, + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts=shared_experts, + shared_experts_input=shared_experts_input, + ) + + def forward_monolithic( + self, + x: torch.Tensor, + router_logits: torch.Tensor | None = None, + input_ids: torch.Tensor | None = None, + ) -> torch.Tensor: + """ + Execute routed experts using the quantization method's apply function. + + This is called by the runner after router selection (for modular kernels) + or with router logits (for monolithic kernels). It delegates to + quant_method.apply() which accesses the weights on this RoutedExperts + instance. 
+ + Args: + x: Input tensor after any transforms + router_logits: Router logits (for monolithic kernels) + input_ids: input ids for DeepSeek V4 + + Returns: + Output tensor from routed experts + """ + assert self.quant_method.is_monolithic + + # Monolithic kernels handle routing internally + return self.quant_method.apply_monolithic( + layer=self, + x=x, + router_logits=router_logits, + input_ids=input_ids, + ) + + def forward( + self, + *args, + **kwargs, + ) -> torch.Tensor: + raise AssertionError("Call forward_modular or forward_monolithic instead.") # Mark the RoutedExperts weight_loader as supporting MoE-specific parameters diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 2b17e79aeae1..cff362389fdc 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -464,7 +464,7 @@ def _apply_quant_method( if self.routed_experts.quant_method.is_monolithic: # Monolithic kernels: pass router_logits to routed_experts - fused_out = self.routed_experts( + fused_out = self.routed_experts.forward_monolithic( x=hidden_states, router_logits=router_logits, input_ids=input_ids, @@ -478,7 +478,7 @@ def _apply_quant_method( input_ids=input_ids, ) - fused_out = self.routed_experts( + fused_out = self.routed_experts.forward_modular( x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, From 26ffc77d2fb866c3ddf2eb365241895cef24f241 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 5 May 2026 17:33:50 +0000 Subject: [PATCH 144/191] fix dbrx Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 3 ++ vllm/model_executor/models/bailing_moe.py | 2 +- .../models/bailing_moe_linear.py | 2 +- vllm/model_executor/models/dbrx.py | 40 ++++++++----------- vllm/model_executor/models/exaone_moe.py | 2 +- vllm/model_executor/models/exaone_moe_mtp.py | 2 +- vllm/model_executor/models/hunyuan_v1.py | 2 
+- vllm/model_executor/models/olmoe.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 2 +- vllm/model_executor/models/qwen3_5.py | 2 +- vllm/model_executor/models/qwen3_5_mtp.py | 2 +- vllm/model_executor/models/qwen3_moe.py | 2 +- vllm/model_executor/models/qwen3_next.py | 2 +- vllm/model_executor/models/qwen3_next_mtp.py | 2 +- vllm/model_executor/models/qwen3_vl_moe.py | 2 +- vllm/model_executor/models/sarvam.py | 2 +- 16 files changed, 34 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index de05cab28a0d..a9d2361a631e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from typing import Any import torch @@ -151,6 +152,7 @@ def FusedMoE( zero_expert_type: str | None = None, hash_indices_table: torch.Tensor | None = None, routed_experts_cls: type[RoutedExperts] | None = None, + routed_experts_args: dict[str, Any] | None = None, ) -> MoERunner: # TODO update comment """FusedMoE layer builder for MoE models. 
@@ -327,6 +329,7 @@ def FusedMoE( e_score_correction_bias=e_score_correction_bias, apply_router_weight_on_input=apply_router_weight_on_input, activation=moe_activation, + **routed_experts_args if routed_experts_args is not None else {}, ) runner = MoERunner( diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index a6e178ae4b69..56e119207dae 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -493,7 +493,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py index 3c93195540bc..7aeb9674e470 100644 --- a/vllm/model_executor/models/bailing_moe_linear.py +++ b/vllm/model_executor/models/bailing_moe_linear.py @@ -1099,7 +1099,7 @@ def normalize_name(name: str) -> str | None: continue # Handle expert weights - if "mlp.experts" in norm_name: # XXXXXXXXXXXXXXXXXXXX + if "mlp.experts" in norm_name: # Expert bias if ( "mlp.experts.e_score_correction_bias" in norm_name diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index b14f915068a4..934c0539186f 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import ( FusedMoE, + RoutedExperts, ) from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -73,33 +74,19 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return router_logits -# XXXXXXXXXXXXXXXXXXXXXXXXXXXXX -class DbrxExperts(torch.nn.Module): 
+class DbrxExperts(RoutedExperts): def __init__( self, config: DbrxConfig, - quant_config: QuantizationConfig | None = None, - params_dtype: torch.dtype | None = None, - prefix: str = "", + *args, + **kwargs, ): - self.moe = FusedMoE( - num_experts=config.ffn_config.moe_num_experts, - top_k=config.ffn_config.moe_top_k, - hidden_size=config.d_model, - intermediate_size=config.ffn_config.ffn_hidden_size, - params_dtype=params_dtype, - renormalize=True, - quant_config=quant_config, - tp_size=get_tensor_model_parallel_world_size(), - prefix=prefix, - ) + super().__init__(*args, **kwargs) self.config = config self.d_model = config.d_model + self.tp_size = self.moe_config.tp_size self.intermediate_size = self.config.ffn_config.ffn_hidden_size // self.tp_size - def forward(self, router_logits, hidden_states) -> torch.Tensor: - return self.moe(hidden_states, router_logits) - # Define custom weight loader for dbrx model def weight_loader( self, @@ -172,11 +159,18 @@ def __init__( self.router = DbrxRouter(config, self.params_dtype) - self.experts = DbrxExperts( - config=config, - quant_config=quant_config, + self.experts = FusedMoE( + num_experts=config.ffn_config.moe_num_experts, + top_k=config.ffn_config.moe_top_k, + hidden_size=config.d_model, + intermediate_size=config.ffn_config.ffn_hidden_size, params_dtype=self.params_dtype, - prefix=f"{prefix}.experts", + renormalize=True, + quant_config=quant_config, + tp_size=get_tensor_model_parallel_world_size(), + prefix=prefix, + routed_experts_cls=DbrxExperts, + routed_experts_args={"config": config}, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py index 81d5bac0c39d..80b7e0957e82 100644 --- a/vllm/model_executor/models/exaone_moe.py +++ b/vllm/model_executor/models/exaone_moe.py @@ -389,7 +389,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: for param_name, weight_name, 
shard_id in stacked_params_mapping: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/exaone_moe_mtp.py b/vllm/model_executor/models/exaone_moe_mtp.py index cbba3d6d0432..b3f8552aac58 100644 --- a/vllm/model_executor/models/exaone_moe_mtp.py +++ b/vllm/model_executor/models/exaone_moe_mtp.py @@ -149,7 +149,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 16523d3ae4c9..fca801b74823 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -785,7 +785,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXX + if "mlp.experts" in name: continue # cross layer only have q_proj, skip qkv pack if weight_name == ".q_proj": diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 9de793a5f75e..1f342ad1733d 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -369,7 +369,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for mlp.experts[0].gate_gate_up_proj, which breaks load. 
- if "mlp.experts" in name: # XXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 27ea6ac6f5be..77eea390eda9 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -453,7 +453,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) # Skip loading extra bias for GPTQ models. diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index 91dc807547b8..2449724cd04f 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -337,7 +337,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py index f955a89cce81..e86b205b9f31 100644 --- a/vllm/model_executor/models/qwen3_5_mtp.py +++ b/vllm/model_executor/models/qwen3_5_mtp.py @@ -231,7 +231,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index c5cabdfb282c..4ec1be3367d8 100644 --- 
a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -579,7 +579,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index b25c6aa80613..96d7e9c713c6 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -575,7 +575,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index 56991bb1e2ae..2f411c48a631 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -165,7 +165,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 73decdebe4f6..7a3cda1ccbf1 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -211,7 +211,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: # name will be updated to mlp.experts[0].gate_up_proj, which # will then be updated below in expert_params_mapping # for 
mlp.experts[0].gate_gate_up_proj, which breaks load. - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue name = name.replace(weight_name, param_name) # Skip loading extra parameters for GPTQ/modelopt models. diff --git a/vllm/model_executor/models/sarvam.py b/vllm/model_executor/models/sarvam.py index d4ed372630cc..fd28e3b3914d 100644 --- a/vllm/model_executor/models/sarvam.py +++ b/vllm/model_executor/models/sarvam.py @@ -560,7 +560,7 @@ def load_weights( for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if "mlp.experts" in name: # XXXXXXXXXXXXXXXXX + if "mlp.experts" in name: continue new_name = name.replace(weight_name, param_name) if new_name.endswith(".bias") and new_name not in params_dict: From b33b31e3577e87a75296ccd0f4e8846925d0194c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 5 May 2026 19:36:01 +0000 Subject: [PATCH 145/191] review comments Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/fused_moe_method_base.py | 2 +- vllm/model_executor/layers/quantization/gptq_marlin.py | 1 + vllm/model_executor/layers/quantization/quark/quark_moe.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index b68279d2cbe4..601d64b792e6 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -22,7 +22,7 @@ ) if TYPE_CHECKING: - from vllm.model_executor.layers.runner.shared_experts import SharedExperts + from vllm.model_executor.layers.fused_moe.runner.shared_experts import SharedExperts logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index c11dffe6a76d..6dc28b664811 100644 --- 
a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -821,5 +821,6 @@ def apply( global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=layer.expert_map, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index b31ef8b1c934..3978de52ed4b 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1435,6 +1435,7 @@ def apply( global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=layer.expert_map, + shared_experts=shared_experts, shared_experts_input=shared_experts_input, ) From 2a686f3913373164ab22fac198926a43ce45df08 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 5 May 2026 19:48:28 +0000 Subject: [PATCH 146/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/models/dbrx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 934c0539186f..213d96a29961 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -77,8 +77,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class DbrxExperts(RoutedExperts): def __init__( self, - config: DbrxConfig, *args, + config: DbrxConfig, **kwargs, ): super().__init__(*args, **kwargs) From 0780907d3a6d9a51e84dfebc62e0e8d4cce01516 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 18:20:43 +0000 Subject: [PATCH 147/191] review comments + redo stuff Signed-off-by: Bill Nell --- .../test_eplb_fused_moe_layer_dep_nvfp4.py | 3 +- tests/kernels/moe/test_moe_layer.py | 3 +- tests/kernels/moe/test_routing.py | 11 +- 
.../test_routed_experts_capture.py | 5 +- vllm/distributed/eplb/eplb_state.py | 11 ++ .../layers/fused_moe/eplb_manager.py | 178 ------------------ vllm/model_executor/layers/fused_moe/layer.py | 105 ++++++++--- .../layers/fused_moe/router/base_router.py | 16 +- .../fused_moe/router/custom_routing_router.py | 9 +- .../router/fused_topk_bias_router.py | 9 +- .../fused_moe/router/fused_topk_router.py | 9 +- .../fused_moe/router/grouped_topk_router.py | 9 +- .../layers/fused_moe/router/router_factory.py | 18 +- .../router/routing_simulator_router.py | 11 +- .../fused_moe/router/zero_expert_router.py | 9 +- .../layers/quantization/modelopt.py | 2 +- 16 files changed, 141 insertions(+), 267 deletions(-) delete mode 100644 vllm/model_executor/layers/fused_moe/eplb_manager.py diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py index 68b2407c2e4b..9ab785af3135 100644 --- a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py +++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py @@ -10,6 +10,7 @@ from tests.kernels.moe.utils import make_test_quant_config from vllm.config import VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( ensure_model_parallel_initialized, @@ -201,7 +202,7 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig): dtype=torch.int32, device=device, ) - fml.enable_eplb = True + fml.eplb_state = EplbLayerState() fml.set_eplb_state( lidx, torch.zeros( diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index b79fa925c2cb..2b27202b6b6f 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1021,7 +1021,6 @@ def make_fake_moe_layer( router = create_fused_moe_router( top_k=top_k, 
global_num_experts=global_num_experts, - # eplb_state=None, # TODO renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -1254,7 +1253,7 @@ def _test_body_eplb( ), ) - eplb_moe_layer.eplb_manager.state.should_record_tensor = torch.ones( + eplb_moe_layer.eplb_state.should_record_tensor = torch.ones( (), dtype=torch.bool, device=device ) diff --git a/tests/kernels/moe/test_routing.py b/tests/kernels/moe/test_routing.py index 90a6cd841efd..41dea8121938 100644 --- a/tests/kernels/moe/test_routing.py +++ b/tests/kernels/moe/test_routing.py @@ -36,9 +36,11 @@ def _is_aiter_capable() -> bool: NUM_EXPERTS = [8, 16, 64] -def setup_eplb_state(enable_eplb: bool, global_num_experts: int) -> EplbLayerState: +def setup_eplb_state( + enable_eplb: bool, global_num_experts: int +) -> EplbLayerState | None: if not enable_eplb: - return EplbLayerState() + return None # Initialize EPLB state with proper tensors for testing # For testing purposes, we use a simple 1:1 mapping (no redundant experts) @@ -349,7 +351,6 @@ def test_fused_topk( top_k=top_k, global_num_experts=global_num_experts, renormalize=renormalize, - enable_eplb=enable_eplb, eplb_state=eplb_state, ) @@ -400,7 +401,6 @@ def test_fused_topk_bias( top_k=top_k, global_num_experts=global_num_experts, renormalize=renormalize, - enable_eplb=enable_eplb, eplb_state=eplb_state, ) @@ -469,7 +469,6 @@ def test_grouped_topk( top_k=top_k, global_num_experts=global_num_experts, renormalize=renormalize, - enable_eplb=enable_eplb, eplb_state=eplb_state, ) @@ -540,7 +539,6 @@ def test_custom( global_num_experts=global_num_experts, custom_routing_function=custom_routing_function, renormalize=renormalize, - enable_eplb=enable_eplb, eplb_state=eplb_state, ) @@ -580,7 +578,6 @@ def test_custom( # router = create_fused_moe_router( # top_k=top_k, # global_num_experts=global_num_experts, -# enable_eplb=enable_eplb, # eplb_state=eplb_state, # ) diff --git 
a/tests/model_executor/test_routed_experts_capture.py b/tests/model_executor/test_routed_experts_capture.py index 0527417d1506..656661ee2b24 100644 --- a/tests/model_executor/test_routed_experts_capture.py +++ b/tests/model_executor/test_routed_experts_capture.py @@ -57,8 +57,7 @@ def _make_router() -> DummyRouter: return DummyRouter( top_k=2, global_num_experts=16, - eplb_state=EplbLayerState(), - enable_eplb=False, + eplb_state=None, indices_type_getter=None, ) @@ -84,7 +83,7 @@ def capture_fn(ids): def test_base_router_capture_with_eplb_enabled(): router = _make_router() - router.enable_eplb = True + router.eplb_state = EplbLayerState() router.eplb_state.expert_load_view = torch.zeros(32, dtype=torch.int64) router.eplb_state.logical_to_physical_map = torch.arange(32).view(32, 1) router.eplb_state.logical_replica_count = torch.ones(32, dtype=torch.int64) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 1da39caccd80..319a5f22c922 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -940,6 +940,17 @@ class EplbLayerState: GPU work. 
""" + def set_layer_state( + self, + moe_layer_idx: int, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + self.expert_load_view = expert_load_view[moe_layer_idx] + self.logical_to_physical_map = logical_to_physical_map[moe_layer_idx] + self.logical_replica_count = logical_replica_count[moe_layer_idx] + def _node_count_with_rank_mapping( pg: ProcessGroup | StatelessProcessGroup, diff --git a/vllm/model_executor/layers/fused_moe/eplb_manager.py b/vllm/model_executor/layers/fused_moe/eplb_manager.py deleted file mode 100644 index a5f349472e66..000000000000 --- a/vllm/model_executor/layers/fused_moe/eplb_manager.py +++ /dev/null @@ -1,178 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -EPLB (Expert Parallelism Load Balancing) Manager. - -This module provides the EplbManager class which encapsulates all EPLB-related -functionality for MoE layers, including state management, expert weight -collection, and expert parameter mapping. -""" - -from collections.abc import Iterable - -import torch - -from vllm.distributed.eplb.eplb_state import EplbLayerState - - -class EplbManager: - """ - Manages Expert Parallelism Load Balancing (EPLB) state and operations - for a MoE layer. - - This class encapsulates all EPLB-related functionality including: - - Runtime state (expert load view, logical-to-physical mapping) - - Expert weight collection for load balancing - - Expert parameter mapping for weight loading with redundant experts - - Validation of EPLB configuration constraints - """ - - def __init__( - self, - ep_size: int, - global_num_experts: int, - logical_num_experts: int, - num_redundant_experts: int = 0, - ): - """ - Initialize EPLB manager. 
- - Args: - ep_size: Expert parallel world size - global_num_experts: Total number of experts (including redundant) - logical_num_experts: Number of logical (non-redundant) experts - num_redundant_experts: Number of redundant experts - """ - self.ep_size = ep_size - self.global_num_experts = global_num_experts - self.logical_num_experts = logical_num_experts - self.num_redundant_experts = num_redundant_experts - - # Runtime EPLB state. - self.state = EplbLayerState() - - # Validate EPLB configuration. - # EPLB currently only supports even distribution of experts across ranks - if self.global_num_experts % self.ep_size != 0: - raise ValueError( - f"EPLB currently only supports even distribution of " - f"experts across ranks. Got {self.global_num_experts} experts " - f"and {self.ep_size} EP ranks." - ) - - def set_state( - self, - moe_layer_idx: int, - expert_load_view: torch.Tensor, - logical_to_physical_map: torch.Tensor, - logical_replica_count: torch.Tensor, - ) -> None: - """ - Register the EPLB state for this layer. - - This is used later in forward pass, where we get the expert mapping - and record the load metrics in `expert_load_view`. - - Args: - moe_layer_idx: Index of this MoE layer - expert_load_view: View into global expert load tracking tensor - logical_to_physical_map: Mapping from logical to physical expert IDs - logical_replica_count: Number of replicas for each logical expert - """ - self.state.expert_load_view = expert_load_view[moe_layer_idx] - self.state.logical_to_physical_map = logical_to_physical_map[moe_layer_idx] - self.state.logical_replica_count = logical_replica_count[moe_layer_idx] - - @staticmethod - def get_expert_weights( - layer: torch.nn.Module, # FusedMoE - ) -> Iterable[torch.Tensor]: - """ - Collect expert weights from the MoE layer for EPLB. - - Returns weights reshaped as (local_num_experts, -1) for efficient - expert weight swapping during load balancing. 
- - Args: - layer: The FusedMoE layer to collect weights from - - Returns: - Iterable of expert weight tensors - """ - - def _maybe_make_contiguous( - name: str, p: torch.nn.Parameter - ) -> torch.nn.Parameter: - """ - In some cases, the last 2 dimensions (the non-expert dimensions) - of the weight scale tensor are transposed. This function - transforms the tensor (view update) so the tensor is contiguous(). - Example: A non-contiguous scale tensor, - `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to - `x_` of shape (E, 16, 32) and stride (512, 32, 1). - Note that we specifically use torch.transpose() so `x_` refers - to the same underlying memory. The tensors `x` and `x_`, pointing - to the same underlying memory make this transformation safe in the - context of EPLB. i.e. It is the same memory and just the view - is different. - Note: This function handles the "weight_scale" tensors specifically. - This could however be generalized to handle similar tensors. - """ - if p.ndim != 3: - return p - if p.is_contiguous(): - # Already contiguous. do nothing. - return p - # p is non-contiguous. We only handle the case where the last 2 - # dimensions of the scales tensor is transposed. We can handle - # other cases when they become relevant. - is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1 - if "weight_scale" not in name or not is_transposed_12: - # do nothing. - return p - - # Do not update the layer parameter as the layer's MoE operations would - # expect the parameter's tensor to the same shape / stride. Instead, - # make a new torch.nn.Parameter that is used just in the context of - # EPLB. - return torch.nn.Parameter( - torch.transpose(p.data, 1, 2), requires_grad=False - ) - - weights = list(layer.named_parameters()) - weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] - - # `w13_input_scale` and `w2_input_scale` are global per-tensor - # activation scales shared across all experts (e.g. NVFP4). 
- # They are broadcast views (stride 0) from .expand() and are - # not actual expert weights, so exclude them from EPLB. - NON_EXPERT_WEIGHTS = { - "e_score_correction_bias", - "w13_input_scale", - "w2_input_scale", - } - - # Parameters of non-expert submodules that live inside runner (MoERunner). - # These must be excluded from EPLB weight rearrangement. - NON_EXPERT_PREFIXES = ( - "runner._shared_experts.", - "runner.gate.", - "runner.routed_input_transform.", - "runner.routed_output_transform.", - ) - - assert all( - weight.is_contiguous() - for name, weight in weights - if not name.startswith(NON_EXPERT_PREFIXES) - and name not in NON_EXPERT_WEIGHTS - ) - - return [ - weight.view(layer.local_num_experts, -1) - for name, weight in weights - if name not in NON_EXPERT_WEIGHTS - and weight.shape != torch.Size([]) - and not name.startswith(NON_EXPERT_PREFIXES) - ] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 086d5a93be56..37fbfe5c658c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -16,7 +16,7 @@ get_pcp_group, get_tensor_model_parallel_world_size, ) -from vllm.distributed.eplb.eplb_state import EplbState +from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState from vllm.logger import init_logger from vllm.model_executor.custom_op import PluggableLayer from vllm.model_executor.layers.fused_moe.activation import MoEActivation @@ -26,7 +26,6 @@ FusedMoEQuantConfig, RoutingMethodType, ) -from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) @@ -347,14 +346,15 @@ def __init__( ) # Create EPLB manager (always constructed for consistent API) - self.eplb_manager: EplbManager | None = None + self.eplb_state: EplbLayerState | None = None if enable_eplb: - self.eplb_manager = EplbManager( - 
ep_size=self.moe_parallel_config.ep_size, - global_num_experts=self.global_num_experts, - logical_num_experts=self.logical_num_experts, - num_redundant_experts=num_redundant_experts, - ) + if self.global_num_experts % self.ep_size != 0: + raise ValueError( + f"EPLB currently only supports even distribution of " + f"experts across ranks. Got {self.global_num_experts} experts " + f"and {self.ep_size} EP ranks." + ) + self.eplb_state = EplbLayerState() else: # EPLB validation is handled by EplbManager.__init__ assert not self.use_ep or num_redundant_experts == 0, ( @@ -470,7 +470,7 @@ def __init__( self.router = create_fused_moe_router( top_k=top_k, global_num_experts=self.global_num_experts, - eplb_manager=self.eplb_manager, + eplb_state=self.eplb_state, renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, @@ -1436,18 +1436,81 @@ def load_weights( yield param_name def get_expert_weights(self) -> Iterable[torch.Tensor]: - """ - Collect expert weights for EPLB load balancing. + def _maybe_make_contiguous( + name: str, p: torch.nn.Parameter + ) -> torch.nn.Parameter: + """ + In some cases, the last 2 dimensions (the non-expert dimensions) + of the weight scale tensor are transposed. This function + transforms the tensor (view update) so the tensor is contiguous(). + Example: A non-contiguous scale tensor, + `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to + `x_` of shape (E, 16, 32) and stride (512, 32, 1). + Note that we specifically use torch.transpose() so `x_` refers + to the same underlying memory. The tensors `x` and `x_`, pointing + to the same underlying memory make this transformation safe in the + context of EPLB. i.e. It is the same memory and just the view + is different. + Note: This function handles the "weight_scale" tensors specifically. + This could however be generalized to handle similar tensors. + """ + if p.ndim != 3: + return p + if p.is_contiguous(): + # Already contiguous. do nothing. 
+ return p + # p is non-contiguous. We only handle the case where the last 2 + # dimensions of the scales tensor is transposed. We can handle + # other cases when they become relevant. + is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1 + if "weight_scale" not in name or not is_transposed_12: + # do nothing. + return p + + # Do not update the layer parameter as the layer's MoE operations would + # expect the parameter's tensor to the same shape / stride. Instead, + # make a new torch.nn.Parameter that is used just in the context of + # EPLB. + return torch.nn.Parameter( + torch.transpose(p.data, 1, 2), requires_grad=False + ) - Returns weights reshaped as (local_num_experts, -1) for efficient - expert weight swapping during load balancing. + weights = list(self.named_parameters()) + weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] + + # `w13_input_scale` and `w2_input_scale` are global per-tensor + # activation scales shared across all experts (e.g. NVFP4). + # They are broadcast views (stride 0) from .expand() and are + # not actual expert weights, so exclude them from EPLB. + NON_EXPERT_WEIGHTS = { + "e_score_correction_bias", + "w13_input_scale", + "w2_input_scale", + } - Delegates to EplbManager. + # Parameters of non-expert submodules that live inside runner (MoERunner). + # These must be excluded from EPLB weight rearrangement. 
+ NON_EXPERT_PREFIXES = ( + "runner._shared_experts.", + "runner.gate.", + "runner.routed_input_transform.", + "runner.routed_output_transform.", + ) - Returns: - Iterable of expert weight tensors - """ - return EplbManager.get_expert_weights(self) + assert all( + weight.is_contiguous() + for name, weight in weights + if not name.startswith(NON_EXPERT_PREFIXES) + and name not in NON_EXPERT_WEIGHTS + ) + + return [ + weight.view(self.local_num_experts, -1) + for name, weight in weights + if name not in NON_EXPERT_WEIGHTS + and weight.shape != torch.Size([]) + and not name.startswith(NON_EXPERT_PREFIXES) + ] def set_eplb_state( self, @@ -1470,8 +1533,8 @@ def set_eplb_state( logical_to_physical_map: Mapping from logical to physical expert IDs logical_replica_count: Number of replicas for each logical expert """ - if self.eplb_manager is not None: - self.eplb_manager.set_state( + if self.eplb_state is not None: + self.eplb_state.set_layer_state( moe_layer_idx, expert_load_view, logical_to_physical_map, diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index e32816b395b9..ff023c1cd19c 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -5,7 +5,7 @@ import torch -from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( FusedMoERouter, ) @@ -148,7 +148,7 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: EplbManager | None = None, + eplb_state: EplbLayerState | None = None, # TODO(bnell): Once the MK is constructed at layer init time, we # can make this a plain value instead of a callback. 
indices_type_getter: Callable[[], torch.dtype | None] | None = None, @@ -162,13 +162,13 @@ def __init__( Args: top_k: Number of experts to select per token global_num_experts: Total number of experts - eplb_manager: Optional EPLB manager for load balancing + eplb_state: Optional EPLBLayerState for load balancing indices_type_getter: Optional callback to get indices dtype """ super().__init__() self.top_k = top_k self.global_num_experts = global_num_experts - self.eplb_manager = eplb_manager + self.eplb_state = eplb_state self.indices_type_getter = indices_type_getter self.capture_fn: Callable[[torch.Tensor], None] | None = None @@ -178,8 +178,8 @@ def set_capture_fn(self, capture_fn: Callable[[torch.Tensor], None] | None) -> N def _validate_eplb_state(self) -> None: """Validate that EPLB state is properly initialized if EPLB is enabled.""" - if self.eplb_manager is not None: - eplb_state = self.eplb_manager.state + if self.eplb_state is not None: + eplb_state = self.eplb_state if eplb_state.expert_load_view is None: raise ValueError("enable_eplb=True requires expert_load_view != None") if eplb_state.logical_to_physical_map is None: @@ -203,8 +203,8 @@ def _get_indices_type(self) -> torch.dtype | None: def _apply_eplb_mapping(self, topk_ids: torch.Tensor) -> torch.Tensor: """Apply EPLB mapping to convert logical expert IDs to physical expert IDs.""" - if self.eplb_manager is not None: - eplb_state = self.eplb_manager.state + if self.eplb_state is not None: + eplb_state = self.eplb_state assert eplb_state.expert_load_view is not None assert eplb_state.logical_to_physical_map is not None assert eplb_state.logical_replica_count is not None diff --git a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py index 00e3e7520031..6983a385a0af 100644 --- a/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py +++ 
b/vllm/model_executor/layers/fused_moe/router/custom_routing_router.py @@ -1,16 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable -from typing import TYPE_CHECKING import torch +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager - class CustomRoutingRouter(BaseRouter): """Router using a custom user-provided routing function.""" @@ -20,14 +17,14 @@ def __init__( top_k: int, global_num_experts: int, custom_routing_function: Callable, - eplb_manager: "EplbManager | None" = None, + eplb_state: EplbLayerState | None = None, renormalize: bool = True, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, indices_type_getter=indices_type_getter, ) self.custom_routing_function = custom_routing_function diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 6d3bd6ac5529..0ca5c3f97952 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools from collections.abc import Callable -from typing import TYPE_CHECKING import torch import torch.nn.functional as F @@ -10,15 +9,13 @@ import vllm._custom_ops as ops import vllm.envs as envs from vllm._aiter_ops import rocm_aiter_ops +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config 
import ( RoutingMethodType, get_routing_method_type, ) from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager - def vllm_topk_softmax( topk_weights: torch.Tensor, @@ -241,7 +238,7 @@ def __init__( e_score_correction_bias: torch.Tensor | None = None, renormalize: bool = True, routed_scaling_factor: float = 1.0, - eplb_manager: "EplbManager | None" = None, + eplb_state: EplbLayerState | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, *, scoring_func: str = "sigmoid", @@ -250,7 +247,7 @@ def __init__( super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, indices_type_getter=indices_type_getter, ) self.e_score_correction_bias = e_score_correction_bias diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index d88786491d7b..a4800eabb908 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -1,21 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable -from typing import TYPE_CHECKING import torch import vllm._custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, get_routing_method_type, ) from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager - def vllm_topk_softmax( topk_weights: torch.Tensor, @@ -125,13 +122,13 @@ def __init__( global_num_experts: int, scoring_func: str = "softmax", renormalize: bool = True, 
- eplb_manager: "EplbManager | None" = None, + eplb_state: EplbLayerState | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, indices_type_getter=indices_type_getter, ) self.renormalize = renormalize diff --git a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py index 461c5c351f05..77624a1b9077 100644 --- a/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable from functools import partial -from typing import TYPE_CHECKING import torch from vllm import _custom_ops as ops from vllm import envs as envs from vllm._aiter_ops import rocm_aiter_ops +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, @@ -25,9 +25,6 @@ from vllm.model_executor.utils import maybe_disable_graph_partition from vllm.platforms import current_platform -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager - def fused_grouped_topk( hidden_states: torch.Tensor, @@ -261,13 +258,13 @@ def __init__( routed_scaling_factor: float = 1.0, e_score_correction_bias: torch.Tensor | None = None, num_fused_shared_experts: int = 0, - eplb_manager: "EplbManager | None" = None, + eplb_state: EplbLayerState | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, indices_type_getter=indices_type_getter, ) self.num_expert_group = 
num_expert_group diff --git a/vllm/model_executor/layers/fused_moe/router/router_factory.py b/vllm/model_executor/layers/fused_moe/router/router_factory.py index 89592830b23b..debcf17edaa3 100644 --- a/vllm/model_executor/layers/fused_moe/router/router_factory.py +++ b/vllm/model_executor/layers/fused_moe/router/router_factory.py @@ -5,8 +5,8 @@ import torch import vllm.envs as envs +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import RoutingMethodType -from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager from vllm.model_executor.layers.fused_moe.router.custom_routing_router import ( CustomRoutingRouter, ) @@ -48,7 +48,7 @@ def create_fused_moe_router( # custom routing parameters custom_routing_function: Callable | None = None, # eplb parameters - eplb_manager: EplbManager | None = None, + eplb_state: EplbLayerState | None = None, # zero expert parameters zero_expert_type: str | None = None, num_logical_experts: int | None = None, @@ -88,7 +88,7 @@ def create_fused_moe_router( custom_routing_function: Optional custom routing function EPLB arguments: - eplb_manager: Optional EPLB (Expert Parallelism Load Balancing) manager + eplb_state: Optional EplbLayerState, None when EPLB is disabled. Zero expert arguments: zero_expert_type: Type of zero expert (e.g. identity). 
If not None, @@ -108,7 +108,7 @@ def create_fused_moe_router( return RoutingSimulatorRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, indices_type_getter=indices_type_getter, ) @@ -122,7 +122,7 @@ def create_fused_moe_router( return ZeroExpertRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, e_score_correction_bias=e_score_correction_bias, num_logical_experts=num_logical_experts, zero_expert_type=zero_expert_type, @@ -142,7 +142,7 @@ def create_fused_moe_router( grouped_topk_router = GroupedTopKRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, num_expert_group=num_expert_group, topk_group=topk_group, renormalize=renormalize, @@ -169,7 +169,7 @@ def create_fused_moe_router( return CustomRoutingRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, custom_routing_function=custom_routing_function, renormalize=renormalize, indices_type_getter=indices_type_getter, @@ -181,7 +181,7 @@ def create_fused_moe_router( return FusedTopKBiasRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, e_score_correction_bias=e_score_correction_bias, renormalize=renormalize, routed_scaling_factor=routed_scaling_factor, @@ -193,7 +193,7 @@ def create_fused_moe_router( return FusedTopKRouter( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, renormalize=renormalize, scoring_func=scoring_func, indices_type_getter=indices_type_getter, diff --git a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py index 7d0b8ba8b61a..233dc82667c8 100644 --- a/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py +++ 
b/vllm/model_executor/layers/fused_moe/router/routing_simulator_router.py @@ -2,19 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Callable -from typing import TYPE_CHECKING, Any +from typing import Any import torch import vllm.envs as envs +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager - - logger = init_logger(__name__) @@ -316,13 +313,13 @@ def __init__( self, top_k: int, global_num_experts: int, - eplb_manager: "EplbManager | None" = None, + eplb_state: EplbLayerState | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, indices_type_getter=indices_type_getter, ) diff --git a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py index d61056026c01..54f0fa4fb0ac 100644 --- a/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py +++ b/vllm/model_executor/layers/fused_moe/router/zero_expert_router.py @@ -2,10 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable -from typing import TYPE_CHECKING import torch +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, get_routing_method_type, @@ -18,9 +18,6 @@ fused_topk_bias, ) -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.eplb_manager import EplbManager - class ZeroExpertRouter(BaseRouter): """Router that handles zero expert 
computation as part of routing. @@ -41,13 +38,13 @@ def __init__( scoring_func: str = "softmax", renormalize: bool = False, routed_scaling_factor: float = 1.0, - eplb_manager: "EplbManager | None" = None, + eplb_state: EplbLayerState | None = None, indices_type_getter: Callable[[], torch.dtype | None] | None = None, ): super().__init__( top_k=top_k, global_num_experts=global_num_experts, - eplb_manager=eplb_manager, + eplb_state=eplb_state, indices_type_getter=indices_type_getter, ) self.e_score_correction_bias = e_score_correction_bias diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 0862efbea294..5f137e778066 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1931,7 +1931,7 @@ def apply_monolithic( assert self.mxfp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM - if layer.enable_eplb: + if layer.eplb_state is not None: raise NotImplementedError( "EPLB is not supported for FlashInfer TRTLLM MXFP8 MoE backend." ) From c1a332c144f33f0a29f61514c50e68d611f690e0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 19:01:16 +0000 Subject: [PATCH 148/191] review comments Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 75 ++++++++++--------- vllm/model_executor/layers/fused_moe/layer.py | 1 - 2 files changed, 40 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 0e204f4d1fb8..4bdda09cc151 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -173,7 +173,6 @@ def __init__( enable_eplb: bool, num_fused_shared_experts: int = 0, rocm_aiter_enabled: bool = False, - device: torch.device | None = None, ): """ Initialize expert map manager. 
@@ -186,14 +185,12 @@ def __init__( placement_strategy: Strategy for placing experts ('linear' or 'round_robin') num_fused_shared_experts: Number of fused shared experts (for AITER) rocm_aiter_enabled: Whether ROCm AITER fusion is enabled - device: Device for tensor allocations """ self.global_num_experts = global_num_experts self.logical_num_experts = logical_num_experts self.moe_parallel_config = moe_parallel_config self.num_fused_shared_experts = num_fused_shared_experts self.rocm_aiter_enabled = rocm_aiter_enabled - self.device = device if moe_parallel_config.use_ep: # Determine expert placement strategy before creating manager @@ -229,7 +226,7 @@ def __init__( (expert_mask == 0) | (expert_mask == 1) ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." - # Log EP configuration (move into EMM?) + # Log EP configuration if self.use_ep: logger.info_once( "[EP Rank %s/%s] Expert parallelism is enabled. Expert " @@ -244,6 +241,15 @@ def __init__( self.get_compressed_map_string(), ) + @property + def device(self) -> torch.device: + if self._expert_map is not None: + return self._expert_map.device + elif self._expert_mask is not None: + return self._expert_mask.device + else: + raise RuntimeError("no device available") + def _init_aiter_shared_experts_topK_buffer( self, dp_size: int, @@ -380,28 +386,29 @@ def update( max_num_batched_tokens: New max batched tokens (if changed, for AITER buffer reinitialization) """ - if new_ep_size is not None: - self.moe_parallel_config.ep_size = new_ep_size - if new_ep_rank is not None: - self.moe_parallel_config.ep_rank = new_ep_rank + with self.device: + if new_ep_size is not None: + self.moe_parallel_config.ep_size = new_ep_size + if new_ep_rank is not None: + self.moe_parallel_config.ep_rank = new_ep_rank + + # Recalculate everything + self._placement_strategy = self._determine_placement_strategy( + self._placement_strategy + ) - # Recalculate everything - self._placement_strategy = 
self._determine_placement_strategy( - self._placement_strategy - ) + self._calculate_expert_maps() + self._maybe_init_routing_tables() - self._calculate_expert_maps() - self._maybe_init_routing_tables() - - # Reinitialize AITER buffer if needed and parameters provided - if self.num_fused_shared_experts > 0 and all( - x is not None for x in [dp_size, top_k, max_num_batched_tokens] - ): - self._init_aiter_shared_experts_topK_buffer( - dp_size=dp_size, # type: ignore - top_k=top_k, # type: ignore - max_num_batched_tokens=max_num_batched_tokens, # type: ignore - ) + # Reinitialize AITER buffer if needed and parameters provided + if self.num_fused_shared_experts > 0 and all( + x is not None for x in [dp_size, top_k, max_num_batched_tokens] + ): + self._init_aiter_shared_experts_topK_buffer( + dp_size=dp_size, # type: ignore + top_k=top_k, # type: ignore + max_num_batched_tokens=max_num_batched_tokens, # type: ignore + ) def get_compressed_map_string(self) -> str: """ @@ -469,13 +476,6 @@ def _calculate_expert_maps(self) -> None: self._local_num_experts += self.num_fused_shared_experts - # Move to device if specified - if self.device is not None: - if self._expert_map is not None: - self._expert_map = self._expert_map.to(self.device) - if self._expert_mask is not None: - self._expert_mask = self._expert_mask.to(self.device) - def ensure_routing_tables_initialized(self) -> None: """ Ensure routing tables are initialized if needed for round-robin. @@ -509,10 +509,13 @@ def _ensure_round_robin_expert_routing_tables( "Round robin not supported for AITER." 
) - device_kwargs = {"device": self.device} if self.device is not None else {} + assert self._expert_map is not None + device = self._expert_map.device global_indices = torch.arange( - self.global_num_experts, dtype=torch.long, **device_kwargs + self.global_num_experts, + dtype=torch.long, + device=device, ) owner = torch.remainder(global_indices, self.ep_size) local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") @@ -523,7 +526,9 @@ def _ensure_round_robin_expert_routing_tables( if remainder > 0: remainder_tensor = torch.tensor( - remainder, dtype=torch.long, **device_kwargs + remainder, + dtype=torch.long, + device=device, ) physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) @@ -536,7 +541,7 @@ def _ensure_round_robin_expert_routing_tables( self.global_num_experts, self.ep_size, dtype=torch.long, - **device_kwargs, + device=device, ) if local_global.numel() != self._local_num_experts: local_global = local_global[: self._local_num_experts] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 13610e1ef3bd..2eee060ee735 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -251,7 +251,6 @@ def __init__( enable_eplb=self.enable_eplb, num_fused_shared_experts=self.num_fused_shared_experts, rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, - device=None, ) # Extract properties from ExpertMapManager From 4a3d996fb21835cbbc2c1289a1f38109903b3a6e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 19:52:37 +0000 Subject: [PATCH 149/191] cleanup routing table initialization and updating Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe.py | 2 +- .../layers/fused_moe/expert_map_manager.py | 10 +-- vllm/model_executor/layers/fused_moe/layer.py | 63 +++++++------------ .../fused_moe/unquantized_fused_moe_method.py | 2 +- .../compressed_tensors_moe_w4a4_mxfp4.py | 2 +- 
.../compressed_tensors_moe_w4a4_nvfp4.py | 2 +- .../compressed_tensors_moe_w8a8_fp8.py | 2 +- .../compressed_tensors_moe_w8a8_int8.py | 2 +- .../compressed_tensors_moe_w8a8_mxfp8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 2 +- .../layers/quantization/gptq_marlin.py | 2 +- .../layers/quantization/modelopt.py | 4 +- .../layers/quantization/mxfp4.py | 4 +- .../layers/quantization/online/fp8.py | 2 +- .../layers/quantization/online/int8.py | 2 +- .../layers/quantization/online/mxfp8.py | 2 +- .../layers/quantization/quark/quark_moe.py | 2 +- 17 files changed, 42 insertions(+), 65 deletions(-) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index ebc3256b548f..3978f1cbe7af 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -1588,7 +1588,7 @@ def test_unquantized_bf16_flashinfer_trtllm_backend( layer.apply_router_weight_on_input = False layer.routed_scaling_factor = None layer.shared_experts = None - layer._maybe_init_expert_routing_tables = lambda: None + layer._expert_routing_tables = lambda: None quant_method.process_weights_after_loading(layer) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 4bdda09cc151..79b1f0aa70b7 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -212,7 +212,7 @@ def __init__( self._calculate_expert_maps() # Initialize routing tables if needed - self._maybe_init_routing_tables() + self._ensure_routing_tables_initialized() self._init_aiter_shared_experts_topK_buffer( dp_size=self.moe_parallel_config.dp_size, @@ -398,7 +398,7 @@ def update( ) self._calculate_expert_maps() - self._maybe_init_routing_tables() + self._ensure_routing_tables_initialized() # Reinitialize AITER buffer if needed and parameters provided if self.num_fused_shared_experts > 0 and all( @@ -476,7 +476,7 @@ def 
_calculate_expert_maps(self) -> None: self._local_num_experts += self.num_fused_shared_experts - def ensure_routing_tables_initialized(self) -> None: + def _ensure_routing_tables_initialized(self) -> None: """ Ensure routing tables are initialized if needed for round-robin. @@ -497,10 +497,6 @@ def ensure_routing_tables_initialized(self) -> None: if not hasattr(self, "_routing_tables"): self._routing_tables = self._ensure_round_robin_expert_routing_tables() - def _maybe_init_routing_tables(self): - """Initialize routing tables if needed for round-robin (internal).""" - self.ensure_routing_tables_initialized() - def _ensure_round_robin_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2eee060ee735..f77c4e9b52db 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -253,11 +253,7 @@ def __init__( rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, ) - # Extract properties from ExpertMapManager - self.local_num_experts = self.expert_map_manager.local_num_experts - self.expert_placement_strategy = self.expert_map_manager.placement_strategy - self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + self.update_expert_map_info() self.top_k = top_k @@ -458,9 +454,8 @@ def maybe_init_modular_kernel(self) -> None: self.ensure_moe_quant_config_init() # routing_tables only needed for round-robin expert placement with # DeepEP all2all backend. 
- routing_tables = self._maybe_init_expert_routing_tables() prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize( - routing_tables=routing_tables + routing_tables=self._expert_routing_tables() ) if prepare_finalize is not None: logger.debug( @@ -512,7 +507,23 @@ def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass return self.runner.is_internal_router() - def _maybe_init_expert_routing_tables( + def update_expert_map_info(self): + # Update local attributes from ExpertMapManager + self.local_num_experts = self.expert_map_manager.local_num_experts + self.expert_placement_strategy = self.expert_map_manager.placement_strategy + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + + # Get routing tables from ExpertMapManager + routing_tables = self.expert_map_manager.routing_tables + if routing_tables is not None: + # Register routing tables as buffers for this layer + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) + + def _expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: # Return cached routing tables if already registered as buffers @@ -525,31 +536,13 @@ def _maybe_init_expert_routing_tables( self.expert_local_to_global, ), ) - - # Delegate to ExpertMapManager to initialize routing tables if needed - # (ExpertMapManager determines if routing tables are needed based on - # placement strategy and backend configuration) - self.expert_map_manager._maybe_init_routing_tables() - - # Get routing tables from ExpertMapManager - routing_tables = self.expert_map_manager.routing_tables - if routing_tables is None: - return None - - # Register routing 
tables as buffers for this layer - global_to_physical, physical_to_global, local_global = routing_tables - self.register_buffer("expert_global_to_physical", global_to_physical) - self.register_buffer("expert_physical_to_global", physical_to_global) - self.register_buffer("expert_local_to_global", local_global) - - return routing_tables + return None def update_expert_map(self): # ep_size and ep_rank should already be updated # Update ExpertMapManager with new EP configuration # Note: ExpertMapManager.update() recalculates expert maps and - # reinitializes routing tables internally, so no need to call - # _maybe_init_expert_routing_tables() again + # reinitializes routing tables internally. vllm_config = get_current_vllm_config() self.expert_map_manager.update( new_ep_size=self.ep_size, @@ -564,19 +557,7 @@ def update_expert_map(self): ) # Update local attributes from ExpertMapManager - self.local_num_experts = self.expert_map_manager.local_num_experts - self.expert_placement_strategy = self.expert_map_manager.placement_strategy - self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - - # Update routing table buffers if they exist - # Note: Routing tables are already initialized by ExpertMapManager.update() - routing_tables = self.expert_map_manager.routing_tables - if routing_tables is not None: - global_to_physical, physical_to_global, local_global = routing_tables - self.register_buffer("expert_global_to_physical", global_to_physical) - self.register_buffer("expert_physical_to_global", physical_to_global) - self.register_buffer("expert_local_to_global", local_global) + self.update_expert_map_info() def _load_per_tensor_weight_scale( self, diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 89697033403d..8096179a3b5a 100644 --- 
a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -172,7 +172,7 @@ def _setup_kernel( moe_config=self.moe, backend=self.unquantized_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py index 629e1c5ef1be..01ffdfae0567 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py @@ -194,7 +194,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: experts_cls=self.experts_cls, mxfp4_backend=self.mxfp4_backend, shared_experts=layer.shared_experts, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), ) def apply( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py index 29c673d0f6e3..46b7db1f0475 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py @@ -236,7 +236,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, experts_cls=self.experts_cls, shared_experts=layer.shared_experts, - 
routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py index bba7e0e7abce..433f7a5c76a7 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py @@ -336,7 +336,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py index bad5b3895b8f..d39dbee747c0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py @@ -147,7 +147,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py index ecd0b54890d1..219a0526c481 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py @@ -138,7 +138,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 1c9237d3f60a..58000c165947 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -774,7 +774,7 @@ def _setup_kernel( moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 7b6f1f9cf6cd..0156744fcc42 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -769,7 +769,7 @@ def _setup_kernel(self, layer: FusedMoE) -> None: w2_g_idx=layer.w2_g_idx, w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices, w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), 
shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 242cc105e470..f9b65b5c77e5 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -896,7 +896,7 @@ def _setup_kernel( moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) @@ -1419,7 +1419,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, experts_cls=self.experts_cls, shared_experts=layer.shared_experts, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 0a516831c4ec..f8c7711ccb4e 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -364,7 +364,7 @@ def _setup_kernel( moe_config=self.moe, mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) @@ -690,7 +690,7 @@ def _setup_kernel( moe_config=self.moe, mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/online/fp8.py b/vllm/model_executor/layers/quantization/online/fp8.py index 9cb697289d7e..dfcff1e21685 100644 --- a/vllm/model_executor/layers/quantization/online/fp8.py +++ 
b/vllm/model_executor/layers/quantization/online/fp8.py @@ -348,7 +348,7 @@ def _setup_kernel( moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/online/int8.py b/vllm/model_executor/layers/quantization/online/int8.py index 4b4c87fbce96..f4d2f9a2a371 100644 --- a/vllm/model_executor/layers/quantization/online/int8.py +++ b/vllm/model_executor/layers/quantization/online/int8.py @@ -99,7 +99,7 @@ def _setup_kernel(self, layer: "FusedMoE") -> None: moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/online/mxfp8.py b/vllm/model_executor/layers/quantization/online/mxfp8.py index 39a32604442c..312da8a12158 100644 --- a/vllm/model_executor/layers/quantization/online/mxfp8.py +++ b/vllm/model_executor/layers/quantization/online/mxfp8.py @@ -199,7 +199,7 @@ def _setup_kernel( moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index d92acb85c265..9d9397e29f8d 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1350,7 +1350,7 @@ def _setup_kernel_via_oracle(self, layer: FusedMoE): moe_config=self.moe, mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, - 
routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) From 778c141a79e3db62ac6d1aa0bc71b5cfef6db7f5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 19:56:26 +0000 Subject: [PATCH 150/191] fix local_num_experts Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 79b1f0aa70b7..f7c6f88077a1 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -455,7 +455,9 @@ def _calculate_expert_maps(self) -> None: """Calculate expert mappings based on placement strategy.""" if self.ep_size == 1: # No EP, all experts are local - self._local_num_experts = self.global_num_experts + self._local_num_experts = ( + self.global_num_experts + self.num_fused_shared_experts + ) self._expert_map = None self._expert_mask = None return From e69a21304ef2abd77809440d4b542623442c886b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 21:51:24 +0000 Subject: [PATCH 151/191] fix device stuff Signed-off-by: Bill Nell --- .../modular_kernel_tools/parallel_utils.py | 1 + tests/kernels/moe/test_moe_layer.py | 12 ---- .../layers/fused_moe/expert_map_manager.py | 72 ++++++++++--------- 3 files changed, 38 insertions(+), 47 deletions(-) diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index ceedab9e97ed..b9e68b46e38e 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -79,6 +79,7 @@ def _worker_parallel_launch( rank = node_rank * world_local_size + local_rank device = torch.device("cuda", local_rank) 
torch.accelerator.set_device_index(device) + torch.set_default_device(device) torch.distributed.init_process_group( backend="cpu:gloo,cuda:nccl", init_method=init_method, diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index e359139c302e..ab304d75b78c 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1013,7 +1013,6 @@ def make_fake_moe_layer( gate: torch.nn.Module | None = None, routed_input_transform: torch.nn.Module | None = None, routed_output_transform: torch.nn.Module | None = None, - # enable_eplb: bool = False, use_ep: bool = False, tp_size: int = 1, dp_size: int = 1, @@ -1022,10 +1021,6 @@ def make_fake_moe_layer( quant_dtype = None activation = MoEActivation.from_str(activation) - # eplb_manager: EplbManager | None = None - # if enable_eplb: - # eplb_manager = EplbManager(num_redundant_experts=num_redundant_experts) - router = create_fused_moe_router( top_k=top_k, global_num_experts=global_num_experts, @@ -1038,7 +1033,6 @@ def make_fake_moe_layer( routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, num_fused_shared_experts=0, # TODO - # eplb_manager=eplb_manager, ) if quant_dtype is not None: @@ -1212,9 +1206,6 @@ def _test_body_eplb( routed_output_transform=routed_output_transform, ) - # if eplb_moe_layer._expert_map is not None: - # eplb_moe_layer._expert_map = eplb_moe_layer._expert_map.to(device) - # All ranks must generate the same permutation initial_indices = torch.arange(num_experts, dtype=torch.long) shuffled_indices = initial_indices[torch.randperm(num_experts)] @@ -1404,9 +1395,6 @@ def _run_one_config( activation=activation, ) - # if moe_layer._expert_map is not None: - # moe_layer._expert_map = moe_layer._expert_map.to(device) - num_tokens = m num_tokens_across_dp = torch.tensor( [num_tokens] * world_size, diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py 
b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index ba964e1ab7b7..6fb5269be4ed 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -173,7 +173,6 @@ def __init__( enable_eplb: bool, num_fused_shared_experts: int = 0, rocm_aiter_enabled: bool = False, - device: torch.device | None = None, ): """ Initialize expert map manager. @@ -186,14 +185,12 @@ def __init__( placement_strategy: Strategy for placing experts ('linear' or 'round_robin') num_fused_shared_experts: Number of fused shared experts (for AITER) rocm_aiter_enabled: Whether ROCm AITER fusion is enabled - device: Device for tensor allocations """ self.global_num_experts = global_num_experts self.logical_num_experts = logical_num_experts self.moe_parallel_config = moe_parallel_config self.num_fused_shared_experts = num_fused_shared_experts self.rocm_aiter_enabled = rocm_aiter_enabled - self.device = device if moe_parallel_config.use_ep: # Determine expert placement strategy before creating manager @@ -244,6 +241,15 @@ def __init__( self.get_compressed_map_string(), ) + @property + def device(self) -> torch.device: + if self._expert_map is not None: + return self._expert_map.device + elif self._expert_mask is not None: + return self._expert_mask.device + else: + raise RuntimeError("no device found") + def _init_aiter_shared_experts_topK_buffer( self, dp_size: int, @@ -380,28 +386,29 @@ def update( max_num_batched_tokens: New max batched tokens (if changed, for AITER buffer reinitialization) """ - if new_ep_size is not None: - self.moe_parallel_config.ep_size = new_ep_size - if new_ep_rank is not None: - self.moe_parallel_config.ep_rank = new_ep_rank - - # Recalculate everything - self._placement_strategy = self._determine_placement_strategy( - self._placement_strategy - ) + with self.device: + if new_ep_size is not None: + self.moe_parallel_config.ep_size = new_ep_size + if new_ep_rank is not None: + 
self.moe_parallel_config.ep_rank = new_ep_rank + + # Recalculate everything + self._placement_strategy = self._determine_placement_strategy( + self._placement_strategy + ) - self._calculate_expert_maps() - self._maybe_init_routing_tables() + self._calculate_expert_maps() + self._maybe_init_routing_tables() - # Reinitialize AITER buffer if needed and parameters provided - if self.num_fused_shared_experts > 0 and all( - x is not None for x in [dp_size, top_k, max_num_batched_tokens] - ): - self._init_aiter_shared_experts_topK_buffer( - dp_size=dp_size, # type: ignore - top_k=top_k, # type: ignore - max_num_batched_tokens=max_num_batched_tokens, # type: ignore - ) + # Reinitialize AITER buffer if needed and parameters provided + if self.num_fused_shared_experts > 0 and all( + x is not None for x in [dp_size, top_k, max_num_batched_tokens] + ): + self._init_aiter_shared_experts_topK_buffer( + dp_size=dp_size, # type: ignore + top_k=top_k, # type: ignore + max_num_batched_tokens=max_num_batched_tokens, # type: ignore + ) def get_compressed_map_string(self) -> str: """ @@ -469,13 +476,6 @@ def _calculate_expert_maps(self) -> None: self._local_num_experts += self.num_fused_shared_experts - # Move to device if specified - if self.device is not None: - if self._expert_map is not None: - self._expert_map = self._expert_map.to(self.device) - if self._expert_mask is not None: - self._expert_mask = self._expert_mask.to(self.device) - def ensure_routing_tables_initialized(self) -> None: """ Ensure routing tables are initialized if needed for round-robin. @@ -509,10 +509,10 @@ def _ensure_round_robin_expert_routing_tables( "Round robin not supported for AITER." 
) - device_kwargs = {"device": self.device} if self.device is not None else {} - global_indices = torch.arange( - self.global_num_experts, dtype=torch.long, **device_kwargs + self.global_num_experts, + dtype=torch.long, + device=self.device, ) owner = torch.remainder(global_indices, self.ep_size) local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") @@ -523,7 +523,9 @@ def _ensure_round_robin_expert_routing_tables( if remainder > 0: remainder_tensor = torch.tensor( - remainder, dtype=torch.long, **device_kwargs + remainder, + dtype=torch.long, + device=self.device, ) physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) @@ -536,7 +538,7 @@ def _ensure_round_robin_expert_routing_tables( self.global_num_experts, self.ep_size, dtype=torch.long, - **device_kwargs, + device=self.device, ) if local_global.numel() != self._local_num_experts: local_global = local_global[: self._local_num_experts] From 3c21f32381158c3037d24e15e90942c525e43a85 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 22:00:21 +0000 Subject: [PATCH 152/191] tweak Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index f7c6f88077a1..c5c02bd6c512 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -70,6 +70,7 @@ def determine_expert_map( # Create a tensor of size num_experts filled with -1 expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32) + # Create an expert map for the local experts if expert_placement_strategy == "linear": start_idx = ep_rank * base_experts + min(ep_rank, remainder) @@ -507,13 +508,10 @@ def _ensure_round_robin_expert_routing_tables( "Round robin not supported for AITER." 
) - assert self._expert_map is not None - device = self._expert_map.device - global_indices = torch.arange( self.global_num_experts, dtype=torch.long, - device=device, + device=self.device, ) owner = torch.remainder(global_indices, self.ep_size) local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") @@ -526,7 +524,7 @@ def _ensure_round_robin_expert_routing_tables( remainder_tensor = torch.tensor( remainder, dtype=torch.long, - device=device, + device=self.device, ) physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) @@ -539,7 +537,7 @@ def _ensure_round_robin_expert_routing_tables( self.global_num_experts, self.ep_size, dtype=torch.long, - device=device, + device=self.device, ) if local_global.numel() != self._local_num_experts: local_global = local_global[: self._local_num_experts] From 2dd2ea9219b5d741fe811571b5d686cf9e0397d4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 22:49:08 +0000 Subject: [PATCH 153/191] try to fix update_expert_map Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 60 +++++-------------- vllm/model_executor/layers/fused_moe/layer.py | 23 +++---- 2 files changed, 24 insertions(+), 59 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index c5c02bd6c512..6f89dc387eb9 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -166,7 +166,6 @@ def __init__( max_num_batched_tokens: int, top_k: int, global_num_experts: int, - logical_num_experts: int, num_redundant_experts: int, num_expert_group: int | None, moe_parallel_config: FusedMoEParallelConfig, @@ -180,7 +179,6 @@ def __init__( Args: global_num_experts: Total number of experts across all ranks - logical_num_experts: Number of logical (non-redundant) experts moe_parallel_config: MoE parallel configuration (contains ep_size, 
ep_rank, backend flags) placement_strategy: Strategy for placing experts ('linear' or 'round_robin') @@ -188,10 +186,11 @@ def __init__( rocm_aiter_enabled: Whether ROCm AITER fusion is enabled """ self.global_num_experts = global_num_experts - self.logical_num_experts = logical_num_experts self.moe_parallel_config = moe_parallel_config self.num_fused_shared_experts = num_fused_shared_experts self.rocm_aiter_enabled = rocm_aiter_enabled + self.top_k = top_k + self.max_num_batched_tokens = max_num_batched_tokens if moe_parallel_config.use_ep: # Determine expert placement strategy before creating manager @@ -215,11 +214,7 @@ def __init__( # Initialize routing tables if needed self._ensure_routing_tables_initialized() - self._init_aiter_shared_experts_topK_buffer( - dp_size=self.moe_parallel_config.dp_size, - top_k=top_k, - max_num_batched_tokens=max_num_batched_tokens, - ) + self._init_aiter_shared_experts_topK_buffer() if self.use_ep and self.rocm_aiter_enabled: expert_mask = self.expert_mask @@ -251,21 +246,17 @@ def device(self) -> torch.device: else: raise RuntimeError("no device available") - def _init_aiter_shared_experts_topK_buffer( - self, - dp_size: int, - top_k: int, - max_num_batched_tokens: int, - ): + def _init_aiter_shared_experts_topK_buffer(self): if self.num_fused_shared_experts > 0: + dp_size = self.moe_parallel_config.dp_size init_aiter_topK_meta_data( n_routed_experts=self.global_num_experts, n_shared_experts=self.num_fused_shared_experts, - top_k=top_k, + top_k=self.top_k, tp_rank=self.ep_rank if self.use_ep else self.tp_rank, tp_size=self.ep_size if self.use_ep else self.tp_size, shared_experts_score=1.0, - max_num_tokens=max_num_batched_tokens * dp_size, + max_num_tokens=self.max_num_batched_tokens * dp_size, is_EP=self.use_ep, ) @@ -368,11 +359,9 @@ def get_local_expert_ids(self) -> list[int]: def update( self, - new_ep_size: int | None = None, - new_ep_rank: int | None = None, - dp_size: int | None = None, - top_k: int | None = None, - 
max_num_batched_tokens: int | None = None, + moe_parallel_config: FusedMoEParallelConfig, + global_num_experts: int, + num_fused_shared_experts: int, ) -> None: """ Update expert mappings for new EP configuration. @@ -380,36 +369,17 @@ def update( Used during dynamic reconfiguration (e.g., elastic scaling). Args: - new_ep_size: New EP world size (if changed) - new_ep_rank: New EP rank (if changed) - dp_size: New DP size (if changed, for AITER buffer reinitialization) - top_k: New top_k (if changed, for AITER buffer reinitialization) - max_num_batched_tokens: New max batched tokens (if changed, for AITER - buffer reinitialization) """ - with self.device: - if new_ep_size is not None: - self.moe_parallel_config.ep_size = new_ep_size - if new_ep_rank is not None: - self.moe_parallel_config.ep_rank = new_ep_rank - - # Recalculate everything - self._placement_strategy = self._determine_placement_strategy( - self._placement_strategy - ) + self.moe_parallel_config = moe_parallel_config + self.global_num_experts = global_num_experts + self.num_fused_shared_experts = num_fused_shared_experts + with self.device: self._calculate_expert_maps() self._ensure_routing_tables_initialized() # Reinitialize AITER buffer if needed and parameters provided - if self.num_fused_shared_experts > 0 and all( - x is not None for x in [dp_size, top_k, max_num_batched_tokens] - ): - self._init_aiter_shared_experts_topK_buffer( - dp_size=dp_size, # type: ignore - top_k=top_k, # type: ignore - max_num_batched_tokens=max_num_batched_tokens, # type: ignore - ) + self._init_aiter_shared_experts_topK_buffer() def get_compressed_map_string(self) -> str: """ diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 49473d6316c0..16b95635ac82 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -238,12 +238,13 @@ def __init__( "Redundant experts are only supported with EPLB." 
) + max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens + # Create ExpertMapManager to handle expert mapping and placement self.expert_map_manager = ExpertMapManager( - max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens, + max_num_batched_tokens=max_num_batched_tokens, top_k=top_k, global_num_experts=self.global_num_experts, - logical_num_experts=self.logical_num_experts, num_redundant_experts=num_redundant_experts, num_expert_group=num_expert_group, moe_parallel_config=self.moe_parallel_config, @@ -323,7 +324,7 @@ def __init__( in_dtype=moe_in_dtype, moe_backend=vllm_config.kernel_config.moe_backend, router_logits_dtype=router_logits_dtype, - max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens, + max_num_tokens=max_num_batched_tokens, has_bias=has_bias, is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, @@ -541,21 +542,15 @@ def _expert_routing_tables( return None def update_expert_map(self): - # ep_size and ep_rank should already be updated # Update ExpertMapManager with new EP configuration + # The moe_parallel_config (including ep_size and ep_rank) + # should already be updated. # Note: ExpertMapManager.update() recalculates expert maps and # reinitializes routing tables internally. 
- vllm_config = get_current_vllm_config() self.expert_map_manager.update( - new_ep_size=self.ep_size, - new_ep_rank=self.ep_rank, - dp_size=get_dp_group().world_size - if self.aiter_fmoe_shared_expert_enabled - else None, - top_k=self.top_k if self.aiter_fmoe_shared_expert_enabled else None, - max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens - if self.aiter_fmoe_shared_expert_enabled - else None, + self.moe_parallel_config, + global_num_experts=self.global_num_experts, + num_fused_shared_experts=self.num_fused_shared_experts, ) # Update local attributes from ExpertMapManager From 979dd651b3c2a4d8bab95b9317822551cb51c44c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 23:00:39 +0000 Subject: [PATCH 154/191] remove unused arg Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 2 -- vllm/model_executor/layers/fused_moe/layer.py | 1 - 2 files changed, 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 6f89dc387eb9..de48a887c431 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -361,7 +361,6 @@ def update( self, moe_parallel_config: FusedMoEParallelConfig, global_num_experts: int, - num_fused_shared_experts: int, ) -> None: """ Update expert mappings for new EP configuration. 
@@ -372,7 +371,6 @@ def update( """ self.moe_parallel_config = moe_parallel_config self.global_num_experts = global_num_experts - self.num_fused_shared_experts = num_fused_shared_experts with self.device: self._calculate_expert_maps() diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 16b95635ac82..d2f87e4eff3f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -550,7 +550,6 @@ def update_expert_map(self): self.expert_map_manager.update( self.moe_parallel_config, global_num_experts=self.global_num_experts, - num_fused_shared_experts=self.num_fused_shared_experts, ) # Update local attributes from ExpertMapManager From 56d87385ffb0e693012993a8c8bd1ccdf6bd5a53 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 23:02:21 +0000 Subject: [PATCH 155/191] update comment Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index de48a887c431..801de110137d 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -368,6 +368,9 @@ def update( Used during dynamic reconfiguration (e.g., elastic scaling). 
Args: + global_num_experts: New total number of experts across all ranks + moe_parallel_config: New MoE parallel configuration (contains ep_size, + ep_rank, backend flags) """ self.moe_parallel_config = moe_parallel_config self.global_num_experts = global_num_experts From 34652de9dd4e0c71ea6f5636611bb7d6a391ab89 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 02:11:09 +0000 Subject: [PATCH 156/191] fix test Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe_layer.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index ab304d75b78c..f292f7cd803b 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -404,11 +404,6 @@ def is_valid_config(config: MoETestConfig) -> tuple[bool, str | None]: "leads to large differences.", ) - # gate requires shared_experts (use_overlapped mode) - # TODO: also not sure this is true - if config.use_gate and not config.use_shared_experts: - return False, "gate requires shared_experts (use_overlapped mode)" - # Skip modelopt_fp4 if not on B100+ (compute capability 10.0+) if ( config.quantization == "modelopt_fp4" @@ -1251,7 +1246,7 @@ def _test_body_eplb( ), ) - eplb_moe_layer.eplb_state.should_record_tensor = torch.ones( + eplb_moe_layer.router.eplb_state.should_record_tensor = torch.ones( (), dtype=torch.bool, device=device ) From f9806c1f3aa70fd1a59779ae06675a349792c639 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 04:01:32 +0000 Subject: [PATCH 157/191] fix lint Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 65 ++++++++----- .../layers/fused_moe/routed_experts.py | 14 ++- .../model_executor/models/transformers/moe.py | 91 ++++++++++++------- 3 files changed, 108 insertions(+), 62 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 09254c706660..78372d2bb734 100644 
--- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -24,11 +24,15 @@ ExpertMapManager, ) from vllm.model_executor.layers.fused_moe.routed_experts import RoutedExperts +from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( + FusedMoERouter, +) from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) from vllm.model_executor.layers.fused_moe.runner.moe_runner import ( MoERunner, + MoERunnerInterface, ) from vllm.model_executor.layers.fused_moe.utils import ( disable_inplace, @@ -131,6 +135,7 @@ def FusedMoE( pcp_size: int | None = None, prefix: str = "", custom_routing_function: Callable | None = None, + router: FusedMoERouter | None = None, scoring_func: str = "softmax", routed_scaling_factor: float = 1.0, swiglu_limit: float | None = None, @@ -151,6 +156,8 @@ def FusedMoE( apply_routed_scale_to_output: bool = False, zero_expert_type: str | None = None, hash_indices_table: torch.Tensor | None = None, + runner_cls: type[MoERunnerInterface] | None = None, + runner_args: dict[str, Any] | None = None, routed_experts_cls: type[RoutedExperts] | None = None, routed_experts_args: dict[str, Any] | None = None, ) -> MoERunner: @@ -241,30 +248,31 @@ def FusedMoE( # TODO(bnell): we should not have to create a router if the kernel is # monolithic. - router = create_fused_moe_router( - top_k=top_k, - global_num_experts=global_num_experts, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - # When apply_routed_scale_to_output is True, we set the scaling factor - # to 1.0 so it ends up being a nop. Applying the scale will be handled - # by the runner in this case. - # The member variable must be set in the same way as the router since - # some quantization methods can access it. 
- routed_scaling_factor=routed_scaling_factor - if not apply_routed_scale_to_output - else 1.0, - e_score_correction_bias=e_score_correction_bias, - num_fused_shared_experts=num_fused_shared_experts, - eplb_state=eplb_state, - zero_expert_type=zero_expert_type, - num_logical_experts=logical_num_experts, - hash_indices_table=hash_indices_table, - ) + if router is None: + router = create_fused_moe_router( + top_k=top_k, + global_num_experts=global_num_experts, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + # When apply_routed_scale_to_output is True, we set the scaling factor + # to 1.0 so it ends up being a nop. Applying the scale will be handled + # by the runner in this case. + # The member variable must be set in the same way as the router since + # some quantization methods can access it. + routed_scaling_factor=routed_scaling_factor + if not apply_routed_scale_to_output + else 1.0, + e_score_correction_bias=e_score_correction_bias, + num_fused_shared_experts=num_fused_shared_experts, + eplb_state=eplb_state, + zero_expert_type=zero_expert_type, + num_logical_experts=logical_num_experts, + hash_indices_table=hash_indices_table, + ) # TODO: move this??????????? is this even needed??? 
# When using zero experts, slice e_score_correction_bias to cover @@ -318,6 +326,7 @@ def FusedMoE( moe_config, quant_config, expert_map_manager=expert_map_manager, + expert_mapping=expert_mapping, # Extra params that are needed by quant_methods, pass along for now top_k=top_k, # can get from moe_config use_grouped_topk=use_grouped_topk, @@ -334,7 +343,10 @@ def FusedMoE( **routed_experts_args if routed_experts_args is not None else {}, ) - runner = MoERunner( + if runner_cls is None: + runner_cls = MoERunner + + runner = runner_cls( layer_name=layer_name, moe_config=moe_config, router=router, @@ -350,6 +362,7 @@ def FusedMoE( routed_scaling_factor=routed_scaling_factor if apply_routed_scale_to_output else 1.0, + **runner_args if runner_args is not None else {}, ) # For smuggling this layer into the fused moe custom op @@ -365,6 +378,7 @@ def fused_moe_make_expert_params_mapping( ckpt_up_proj_name: str, num_experts: int, num_redundant_experts: int = 0, + routed_experts_prefix: str = "routed_experts", ) -> list[tuple[str, str, int, str]]: """Delegate to EPLB manager.""" return RoutedExperts.make_expert_params_mapping( @@ -374,4 +388,5 @@ def fused_moe_make_expert_params_mapping( ckpt_up_proj_name, num_experts, num_redundant_experts, + routed_experts_prefix, ) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index d05088a24334..6a0c1fd17649 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -60,12 +60,14 @@ def __init__( moe_config: FusedMoEConfig, quant_config: QuantizationConfig | None, expert_map_manager: ExpertMapManager, + expert_mapping: list[tuple[str, str, int, str]] | None = None, **kwargs, ): super().__init__() self.layer_name = layer_name self.moe_config = moe_config self.quant_config = quant_config + self.expert_mapping = expert_mapping self.expert_map_manager = expert_map_manager self.hidden_size = 
moe_config.hidden_dim self.global_num_experts = moe_config.num_experts @@ -869,6 +871,7 @@ def make_expert_params_mapping( ckpt_up_proj_name: str, num_experts: int, num_redundant_experts: int = 0, + routed_experts_prefix: str = "routed_experts", ) -> list[tuple[str, str, int, str]]: """ Create expert parameter mapping for weight loading with redundant experts. @@ -910,13 +913,16 @@ def make_expert_params_mapping( else "" ) + if routed_experts_prefix != "": + routed_experts_prefix = f"{routed_experts_prefix}." + return [ # (param_name, weight_name, expert_id, shard_id) ( - f"experts.routed_experts.{base_layer}w13_" + f"experts.{routed_experts_prefix}{base_layer}w13_" if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] - else f"experts.routed_experts.{base_layer}w2_", - f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", + else f"experts.{routed_experts_prefix}{base_layer}w2_", + f"experts.{routed_experts_prefix}{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", expert_id, shard_id, ) @@ -981,7 +987,7 @@ def _maybe_make_contiguous( "w2_input_scale", } - # Parameters of non-expert submodules that live inside runner (MoERunner). + # Parameters of non-expert submodules that live inside runner (RoutedExperts). # These must be excluded from EPLB weight rearrangement. NON_EXPERT_PREFIXES = () diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index f7ee7cf27722..b74a255b82ea 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -16,6 +16,9 @@ # limitations under the License. 
"""Transformers modeling backend mixin for Mixture of Experts (MoE) models.""" +from collections.abc import Iterable +from dataclasses import dataclass +from functools import partial from typing import TYPE_CHECKING, Any import torch @@ -41,38 +44,22 @@ from vllm.config import VllmConfig +@dataclass +class TransformersMoEState: + topk_ids: torch.Tensor | None = None + is_sequence_parallel: bool = False + + # --8<-- [start:transformers_fused_moe] @PluggableLayer.register("transformers_fused_moe") class TransformersFusedMoE(MoERunner): """Custom FusedMoE for the Transformers modeling backend.""" # --8<-- [end:transformers_fused_moe] - - def __init__(self, *args, **kwargs): - self._topk_ids: torch.Tensor = None - - def custom_routing_function(hidden_states, gating_output, topk, renormalize): - """Return `topk_weights` from `gating_output` and the - `topk_ids` we stored in the layer earlier.""" - topk_weights = gating_output - topk_ids = self._topk_ids - # Handle all gather in expert parallel - if topk_ids.size(0) != hidden_states.size(0): - dp_metadata = get_forward_context().dp_metadata - sizes = dp_metadata.get_chunk_sizes_across_dp_rank() - is_sp = self.is_sequence_parallel - dist_group = get_ep_group() if is_sp else get_dp_group() - assert sizes[dist_group.rank_in_group] == topk_ids.shape[0] - (topk_ids,) = dist_group.all_gatherv([topk_ids], 0, sizes) - return topk_weights, topk_ids - - kwargs["custom_routing_function"] = custom_routing_function - self.runner = FusedMoE(*args, **kwargs) - - def __getattr__(self, name): - # Delegate attribute access to the originalr runner. 
This is only - # called when normal lookup (instance __dict__, class MRO) fails, - return getattr(self.runner, name) + def __init__(self, *args, moe_state: TransformersMoEState, **kwargs): + super().__init__(*args, **kwargs) + self.moe_state = moe_state + self.moe_state.is_sequence_parallel = self.moe_config.is_sequence_parallel def forward( self, @@ -84,12 +71,19 @@ def forward( """In Transformers `experts.forward` will have this signature. We discard any extra kwargs because we cannot use them here.""" - return torch.ops.vllm.transformers_moe_forward( - hidden_states, - topk_ids.to(torch.int32), - topk_weights.to(torch.float32), - self.layer_name, - ) + + self.moe_state.topk_ids = topk_ids.to(torch.int32) + topk_weights = topk_weights.to(torch.float32) + + # Clone hidden_states because it will be mutated in-place in FusedMoE + # TODO(bnell): figure out a way to avoid calling runner directly. + # it is a hack that the weight are being passed via logits. + return super().forward(hidden_states.clone(), topk_weights) + + def load_weights( + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[str]: + return self.routed_experts.load_weights(weights) def transformers_moe_forward( @@ -195,6 +189,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: ckpt_up_proj_name=up_proj, num_experts=num_experts, num_redundant_experts=num_redundant_experts, + routed_experts_prefix="", ) ) return expert_mapping @@ -292,8 +287,32 @@ def _recursive_replace(module: nn.Module, prefix: str): if "shared_expert" in mlp_param_name: self.num_shared_experts = 1 break + # Replace experts module with FusedMoE - fused_experts = TransformersFusedMoE( + moe_state = TransformersMoEState() + + def custom_routing_function( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + moe_state: TransformersMoEState, + ): + """Return `topk_weights` from `gating_output` and the + `topk_ids` we stored in the layer earlier.""" + topk_weights 
= gating_output + topk_ids = moe_state.topk_ids + # Handle all gather in expert parallel + if topk_ids.size(0) != hidden_states.size(0): + dp_metadata = get_forward_context().dp_metadata + sizes = dp_metadata.get_chunk_sizes_across_dp_rank() + is_sp = moe_state.is_sequence_parallel + dist_group = get_ep_group() if is_sp else get_dp_group() + assert sizes[dist_group.rank_in_group] == topk_ids.shape[0] + (topk_ids,) = dist_group.all_gatherv([topk_ids], 0, sizes) + return topk_weights, topk_ids + + fused_experts = FusedMoE( num_experts=num_experts, top_k=top_k, hidden_size=hidden_size, @@ -310,6 +329,12 @@ def _recursive_replace(module: nn.Module, prefix: str): num_redundant_experts=num_redundant_experts, has_bias=has_bias, expert_mapping=expert_mapping, + custom_routing_function=partial( + custom_routing_function, + moe_state=moe_state, + ), + runner_cls=TransformersFusedMoE, + runner_args={"moe_state": moe_state}, ) mlp.experts = fused_experts log_replacement(qual_name, experts, fused_experts) From dd14e4e6b14a03ffc1fe711dd40435bd8940525a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 14:02:51 +0000 Subject: [PATCH 158/191] fix lora Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 83 ++++++++++--------- vllm/lora/model_manager.py | 1 - .../layers/fused_moe/routed_experts.py | 2 +- .../model_executor/models/transformers/moe.py | 1 + 4 files changed, 48 insertions(+), 39 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 99e14ee697cb..b3bdfc70b919 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -13,7 +13,7 @@ ) from vllm.distributed.utils import divide from vllm.lora.layers.base import BaseLayerWithLoRA -from vllm.model_executor.layers.fused_moe import MoERunner, RoutedExperts +from vllm.model_executor.layers.fused_moe import MoERunner from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) @@ -26,16 +26,17 @@ from 
.utils import _get_lora_device -# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX class FusedMoEWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: MoERunner) -> None: super().__init__() self.base_layer = base_layer + self._shared_experts = base_layer._shared_experts + routed_experts = self.base_layer.routed_experts - assert not self.routed_experts.use_ep, ( + assert not routed_experts.use_ep, ( "EP support for Fused MoE LoRA is not implemented yet." ) - assert not self.routed_experts.quant_method.is_monolithic, ( + assert not routed_experts.quant_method.is_monolithic, ( "Monolithic kernels are not supported for Fused MoE LoRA." ) self.tp_size = get_tensor_model_parallel_world_size() @@ -45,15 +46,15 @@ def __init__(self, base_layer: MoERunner) -> None: # since there's only up_proj (w1), not gate_proj + up_proj (w1 + w3) self._w13_slices = 2 if base_layer.moe_config.is_act_and_mul else 1 - self.routed_experts.ensure_moe_quant_config_init() - if getattr(self.routed_experts.quant_method, "supports_internal_mk", False): - moe_kernel = self.routed_experts.quant_method.moe_kernel + routed_experts._ensure_moe_quant_config_init() + if getattr(routed_experts.quant_method, "supports_internal_mk", False): + moe_kernel = routed_experts.quant_method.moe_kernel else: prepare_finalize = MoEPrepareAndFinalizeNoDPEPModular() moe_kernel = FusedMoEKernel( prepare_finalize, - self.routed_experts.quant_method.select_gemm_impl( - prepare_finalize, self.routed_experts + routed_experts.quant_method.select_gemm_impl( + prepare_finalize, routed_experts ), ) assert moe_kernel.supports_lora(), ( @@ -65,15 +66,23 @@ def __init__(self, base_layer: MoERunner) -> None: ) self._fused_experts = moe_kernel.fused_experts self.base_layer._replace_quant_method( - FusedMoEModularMethod(self.base_layer.quant_method, moe_kernel) + FusedMoEModularMethod(self.base_layer._quant_method, moe_kernel) ) @property - def routed_experts(self) -> RoutedExperts: - return self.base_layer.routed_experts + def 
hidden_size(self) -> int: + return self.base_layer.moe_config.hidden_dim + + @property + def local_num_experts(self) -> int: + return self.base_layer.moe_config.num_local_experts + + @property + def intermediate_size_per_partition(self) -> int: + return self.base_layer.moe_config.intermediate_size_per_partition def _build_lora_context(self): - moe_config = self.routed_experts.moe_config + moe_config = self.base_layer.moe_config return MoELoRAContext( w13_lora_a_stacked=self.w13_lora_a_stacked, w13_lora_b_stacked=self.w13_lora_b_stacked, @@ -86,7 +95,7 @@ def _build_lora_context(self): fully_sharded=self.fully_sharded, tp_rank=self.tp_rank, tp_size=self.tp_size, - local_num_experts=moe_config.num_local_experts, + local_num_experts=self.local_num_experts, punica_wrapper=self.punica_wrapper, use_tuned_config=bool(envs.VLLM_TUNED_CONFIG_FOLDER), ) @@ -100,11 +109,11 @@ def _create_lora_a_weights( torch.zeros( ( max_loras, - self.routed_experts.local_num_experts, + self.local_num_experts, lora_config.max_lora_rank if not self.fully_sharded else divide(lora_config.max_lora_rank, self.tp_size), - self.routed_experts.hidden_size, + self.hidden_size, ), dtype=lora_config.lora_dtype, device=self.device, @@ -115,9 +124,9 @@ def _create_lora_a_weights( torch.zeros( ( max_loras, - self.routed_experts.local_num_experts, + self.local_num_experts, lora_config.max_lora_rank, - self.routed_experts.intermediate_size_per_partition, + self.intermediate_size_per_partition, ), dtype=lora_config.lora_dtype, device=self.device, @@ -129,8 +138,8 @@ def _create_lora_b_weights(self, max_loras: int, lora_config: LoRAConfig): torch.zeros( ( max_loras, - self.routed_experts.local_num_experts, - self.routed_experts.intermediate_size_per_partition, + self.local_num_experts, + self.intermediate_size_per_partition, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -142,10 +151,10 @@ def _create_lora_b_weights(self, max_loras: int, lora_config: LoRAConfig): torch.zeros( ( max_loras, - 
self.routed_experts.local_num_experts, - self.routed_experts.hidden_size + self.local_num_experts, + self.hidden_size if not self.fully_sharded - else divide(self.routed_experts.hidden_size, self.tp_size), + else divide(self.hidden_size, self.tp_size), lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -175,7 +184,7 @@ def create_lora_weights( self.lora_a_stacked = [] self.lora_b_stacked = [] for lora_id in range(max_loras): - for experts_id in range(self.routed_experts.local_num_experts): + for experts_id in range(self.local_num_experts): # For gated MoE: gate_proj (w1), down_proj (w2), up_proj (w3) # For non-gated MoE: up_proj (w1), down_proj (w2) self.lora_a_stacked.append( @@ -222,7 +231,7 @@ def _slice_w13_b(self, w13_lora_b: torch.Tensor): return w13_lora_b # w13_lora_b shape (num_experts,output_size,rank) - shard_size = self.routed_experts.intermediate_size_per_partition + shard_size = self.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size @@ -235,7 +244,7 @@ def _slice_w2_a(self, w2_lora_a: torch.Tensor) -> torch.Tensor: if self.tp_size == 1: return w2_lora_a # w2_lora_a shape (num_experts,rank,input_size) - shard_size = self.routed_experts.intermediate_size_per_partition + shard_size = self.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size @@ -336,7 +345,7 @@ def forward(self, *args, **kwargs): @property def quant_method(self): - return self.routed_experts.quant_method + return self.base_layer._quant_method @property def is_internal_router(self) -> bool: @@ -352,12 +361,12 @@ def can_replace_layer( ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - # source_layer is FusedMoE + # source_layer is MoERunner return isinstance(source_layer, MoERunner) and len(packed_modules_list) == 2 class FusedMoE3DWithLoRA(FusedMoEWithLoRA): - def __init__(self, base_layer): + def __init__(self, base_layer: 
MoERunner): super().__init__(base_layer) self._w13_slices = 1 @@ -366,8 +375,8 @@ def _create_lora_b_weights(self, max_loras, lora_config): torch.zeros( ( max_loras, - self.routed_experts.local_num_experts, - self.routed_experts.intermediate_size_per_partition * 2, + self.local_num_experts, + self.intermediate_size_per_partition * 2, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -379,10 +388,10 @@ def _create_lora_b_weights(self, max_loras, lora_config): torch.zeros( ( max_loras, - self.routed_experts.local_num_experts, - self.routed_experts.hidden_size + self.local_num_experts, + self.hidden_size if not self.fully_sharded - else divide(self.routed_experts.hidden_size, self.tp_size), + else divide(self.hidden_size, self.tp_size), lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -415,7 +424,7 @@ def _slice_w13_b(self, w13_lora_b: torch.Tensor): return w13_lora_b # w13_lora_b shape (num_experts,output_size,rank) - shard_size = self.routed_experts.intermediate_size_per_partition + shard_size = self.intermediate_size_per_partition start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size # HACK: Currently, only GPT-OSS is in interleaved order @@ -503,7 +512,7 @@ def w2_output_size(self): """ Full size """ - return self.routed_experts.hidden_size + return self.hidden_size @classmethod def can_replace_layer( @@ -514,5 +523,5 @@ def can_replace_layer( model_config: PretrainedConfig | None = None, ) -> bool: """Returns True if the layer can be replaced by this LoRA layer.""" - # source_layer is FusedMoE + # source_layer is MoERunner return isinstance(source_layer, MoERunner) and len(packed_modules_list) == 1 diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index b3914463fa29..2e8508f59011 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -392,7 +392,6 @@ def _parent_module(module_name: str) -> str: parts = module_name.split(".")[-1] packed_moduled_lst = 
self.packed_modules_mapping.get(parts, []) - # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX if isinstance(module, MoERunner): # packed_moduled_lst is used here to just determine whether to # instantiate FusedMoE3DWithLoRA or FusedMoEWithLoRA, and the diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 6a0c1fd17649..d768c561955b 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -922,7 +922,7 @@ def make_expert_params_mapping( f"experts.{routed_experts_prefix}{base_layer}w13_" if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name] else f"experts.{routed_experts_prefix}{base_layer}w2_", - f"experts.{routed_experts_prefix}{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", + f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}", expert_id, shard_id, ) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index b74a255b82ea..362c997cb327 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -86,6 +86,7 @@ def load_weights( return self.routed_experts.load_weights(weights) +# TODO(bnell): Is this still needed? Probably broken if it is. 
def transformers_moe_forward( hidden_states: torch.Tensor, topk_ids: torch.Tensor, From 27e0f6b32178f14cc54f3e6ad137d0573a3a9a6c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 18:17:01 +0000 Subject: [PATCH 159/191] fix lint Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 78372d2bb734..a17eb1a8eb52 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -32,7 +32,6 @@ ) from vllm.model_executor.layers.fused_moe.runner.moe_runner import ( MoERunner, - MoERunnerInterface, ) from vllm.model_executor.layers.fused_moe.utils import ( disable_inplace, @@ -156,7 +155,7 @@ def FusedMoE( apply_routed_scale_to_output: bool = False, zero_expert_type: str | None = None, hash_indices_table: torch.Tensor | None = None, - runner_cls: type[MoERunnerInterface] | None = None, + runner_cls: type[MoERunner] | None = None, runner_args: dict[str, Any] | None = None, routed_experts_cls: type[RoutedExperts] | None = None, routed_experts_args: dict[str, Any] | None = None, From 121517ce277f16c626386d3ea19489f71ae275f6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 18:19:51 +0000 Subject: [PATCH 160/191] fix doc Signed-off-by: Bill Nell --- docs/design/moe_kernel_features.md | 2 +- vllm/model_executor/layers/fused_moe/router/base_router.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 54b796fde3bf..685a38ec0f30 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. 
- [`CompressedTensorsW4A4Nvfp4MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.compressed_tensors_moe_w4a4_nvfp4.CompressedTensorsW4A4Nvfp4MoEMethod] - [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.compressed_tensors_moe_w8a8_fp8.CompressedTensorsW8A8Fp8MoEMethod] - [`GptOssMxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.GptOssMxfp4MoEMethod] -- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] +- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.UnquantizedFusedMoEMethod] ## Fused Experts Kernels diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index 5b3476ff5406..86b8ca1a853c 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -160,7 +160,6 @@ def __init__( top_k: Number of experts to select per token global_num_experts: Total number of experts eplb_state: Optional EPLBLayerState for load balancing - indices_type_getter: Optional callback to get indices dtype """ super().__init__() self.top_k = top_k From 446e805c24d48ef31f7086c8061fb14ed9992d8f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 21:04:31 +0000 Subject: [PATCH 161/191] move state around Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 7 ----- .../fused_moe/routed_experts_capturer.py | 21 ++++++++++++--- .../layers/fused_moe/router/base_router.py | 2 +- .../fused_moe/router/fused_moe_router.py | 27 ++++++++++++++++++- .../layers/fused_moe/runner/moe_runner.py | 8 ------ 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 49586a6750d9..456f40bbf7a3 100644 --- 
a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -246,9 +246,6 @@ class FusedMoE(PluggableLayer): not supported by the router (or the experts). """ - # Auto-incrementing layer ID for routing replay buffer binding. - _next_moe_layer_id: int = 0 - # --8<-- [end:fused_moe] def __init__( @@ -293,10 +290,6 @@ def __init__( ): super().__init__() - # Assign unique layer ID for routing replay buffer binding. - self.moe_layer_id = FusedMoE._next_moe_layer_id - FusedMoE._next_moe_layer_id += 1 - if params_dtype is None: params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index e9d1a8f22008..781661737e42 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -221,6 +221,9 @@ def get_host_cache(self): def get_device_cache(self): raise NotImplementedError + def map_layer_id(self, layer_id: int) -> int: + raise NotImplementedError + def _count_moe_layers(hf_config) -> int: """Count the number of MoE layers in a model. @@ -293,6 +296,8 @@ def __init__( device=device, ) + self._id_map: dict[int, int] = {} + # ---- Async D2H pipeline (rank-0 only) ---- # Non-rank-0 workers only need the device buffer for symmetric # CUDA graph capture; they skip the D2H pipeline entirely. 
@@ -476,6 +481,13 @@ def get_host_cache(self): def get_device_cache(self): return self.device_cache + def map_layer_id(self, layer_id: int) -> int: + if layer_id not in self._id_map: + next_id = len(self._id_map) + self._id_map[layer_id] = next_id + return next_id + return self._id_map[layer_id] + class _RoutedExpertsCapturerNoop(RoutedExpertsCapturer): def __init__(self): @@ -499,6 +511,9 @@ def get_host_cache(self): def get_device_cache(self): pass + def map_layer_id(self, layer_id: int) -> int: + return 0 + # Global capturer instance (per-process) _global_expert_capturer: RoutedExpertsCapturer | None = _RoutedExpertsCapturerNoop() @@ -794,7 +809,7 @@ def bind_routing_capture_to_model(model) -> None: bound = 0 for module in model.modules(): - if isinstance(module, FusedMoE) and hasattr(module, "moe_layer_id"): + if isinstance(module, FusedMoE): # Per-FusedMoE configurations not yet validated for routing # capture. These signals are only set after model init, so a # config-level guard cannot see them. @@ -815,9 +830,9 @@ def bind_routing_capture_to_model(model) -> None: f"dp_size={module.moe_config.dp_size})." ) - layer_id = module.moe_layer_id + layer_id = capturer.map_layer_id(module.layer_id) layer_buf = buffer[layer_id] # (N_max, K) - module._routing_replay_out = layer_buf + module.router._routing_replay_out = layer_buf # Mark each per-layer view as static so CUDA graphs don't # snapshot/restore or relocate the buffer during replay. 
if hasattr(torch.compiler, "cudagraph_mark_tensor_static"): diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index 0138eb59c91c..a568fb32f9ba 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -247,7 +247,7 @@ def _compute_routing( """ raise NotImplementedError - def select_experts( + def _select_experts( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py index d82085254f9b..94307126a0b4 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py @@ -26,6 +26,16 @@ def set_capture_fn( def routing_method_type(self) -> RoutingMethodType: raise NotImplementedError + @abstractmethod + def _select_experts( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + *, + input_ids: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError + @abstractmethod def select_experts( self, @@ -47,4 +57,19 @@ def select_experts( equivalent to global logical ids, so should be compatible with plain MoE implementations without redundant experts. """ - raise NotImplementedError + + topk_weights, topk_ids = self._select_experts( + hidden_states, + router_logits, + input_ids=input_ids, + ) + + # Get routing replay buffer from persistent attribute + # (set by bind_routing_capture_to_model during capturer init) + routing_replay_out = getattr(self, "_routing_replay_out", None) + + # Write routing data for non-monolithic path (Triton, etc.) 
+ if routing_replay_out is not None: + routing_replay_out[: topk_ids.shape[0]].copy_(topk_ids.to(torch.int16)) + + return topk_weights, topk_ids diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 543049060c1c..2eee8acf6b8f 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -451,10 +451,6 @@ def _apply_quant_method( shared_experts_input, SharedExpertsOrder.NO_OVERLAP ) - # Get routing replay buffer from persistent layer attribute - # (set by bind_routing_capture_to_model during capturer init) - routing_replay_out = getattr(layer, "_routing_replay_out", None) - if self._quant_method.is_monolithic: fused_out = self._quant_method.apply_monolithic( layer=layer, @@ -469,10 +465,6 @@ def _apply_quant_method( input_ids=input_ids, ) - # Write routing data for non-monolithic path (Triton, etc.) - if routing_replay_out is not None: - routing_replay_out[: topk_ids.shape[0]].copy_(topk_ids.to(torch.int16)) - # Passing shared_experts_input in case SharedExpertsOrder is # MK_INTERNAL_OVERLAPPED. 
fused_out = self._quant_method.apply( From bd1b8cccebb4ea03325a3c729f29a2df13196e68 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 21:08:09 +0000 Subject: [PATCH 162/191] update test Signed-off-by: Bill Nell --- .../test_routed_experts_capture.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/model_executor/test_routed_experts_capture.py b/tests/model_executor/test_routed_experts_capture.py index d33d716c1d14..14a88b9392e4 100644 --- a/tests/model_executor/test_routed_experts_capture.py +++ b/tests/model_executor/test_routed_experts_capture.py @@ -18,13 +18,19 @@ class _DummyMoEConfig: class _DummyQuantMethod: supports_internal_mk = True - class DummyFusedMoE: - _routing_replay_out: torch.Tensor + class _DummyRouter: + _routing_replay_out: torch.Tensor | None = None + class DummyFusedMoE: def __init__(self, moe_layer_id): self.moe_layer_id = moe_layer_id self.moe_config = _DummyMoEConfig() self.quant_method = _DummyQuantMethod() + self.router = _DummyRouter() + + @property + def layer_id(self) -> int: + return self.moe_layer_id monkeypatch.setattr(fused_moe_layer, "FusedMoE", DummyFusedMoE) @@ -39,6 +45,9 @@ class DummyCapturer: def get_device_cache(self): return DummyDeviceCache(buffer) + def map_layer_id(self, id: int) -> int: + return id + monkeypatch.setattr(rec_mod, "get_global_experts_capturer", lambda: DummyCapturer()) m0 = DummyFusedMoE(moe_layer_id=0) @@ -50,8 +59,8 @@ def modules(self): rec_mod.bind_routing_capture_to_model(DummyModel()) - assert torch.equal(m0._routing_replay_out, buffer[0]) - assert torch.equal(m2._routing_replay_out, buffer[2]) + assert torch.equal(m0.router._routing_replay_out, buffer[0]) + assert torch.equal(m2.router._routing_replay_out, buffer[2]) def test_bind_routing_capture_to_model_noop_when_disabled(monkeypatch): From b1573e4d7d6b436e366d446c68c51c64d8ee9a37 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 21:17:06 +0000 Subject: [PATCH 163/191] fix lint 
Signed-off-by: Bill Nell --- .../layers/fused_moe/router/fused_moe_router.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py index 94307126a0b4..d0bd9c823ab6 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py @@ -14,6 +14,9 @@ class FusedMoERouter(ABC): method that is used for routing hidden states based on router logits. """ + def __init__(self): + self._routing_replay_out: torch.Tensor | None = None + @abstractmethod def set_capture_fn( self, @@ -36,7 +39,6 @@ def _select_experts( ) -> tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError - @abstractmethod def select_experts( self, hidden_states: torch.Tensor, From d2bc7dfb57ff75a19dadacb081695cbccca5da9b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 21:17:51 +0000 Subject: [PATCH 164/191] fix lint Signed-off-by: Bill Nell --- .../layers/fused_moe/router/fused_moe_router.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py index d0bd9c823ab6..5acda8ec6755 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py @@ -66,12 +66,11 @@ def select_experts( input_ids=input_ids, ) - # Get routing replay buffer from persistent attribute - # (set by bind_routing_capture_to_model during capturer init) - routing_replay_out = getattr(self, "_routing_replay_out", None) - # Write routing data for non-monolithic path (Triton, etc.) 
- if routing_replay_out is not None: - routing_replay_out[: topk_ids.shape[0]].copy_(topk_ids.to(torch.int16)) + # (buffer is set by bind_routing_capture_to_model during capturer init) + if self._routing_replay_out is not None: + self._routing_replay_out[: topk_ids.shape[0]].copy_( + topk_ids.to(torch.int16) + ) return topk_weights, topk_ids From 64ccbf85adfa6fbd4669ae29a5087eb6baef1d95 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 11 May 2026 18:49:44 +0000 Subject: [PATCH 165/191] use layer_name instead of layer_id for map Signed-off-by: Bill Nell --- .../test_routed_experts_capture.py | 16 ++++++---------- vllm/model_executor/layers/fused_moe/layer.py | 2 ++ .../layers/fused_moe/routed_experts_capturer.py | 16 ++++++++-------- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/tests/model_executor/test_routed_experts_capture.py b/tests/model_executor/test_routed_experts_capture.py index 14a88b9392e4..834e4a38617b 100644 --- a/tests/model_executor/test_routed_experts_capture.py +++ b/tests/model_executor/test_routed_experts_capture.py @@ -22,16 +22,12 @@ class _DummyRouter: _routing_replay_out: torch.Tensor | None = None class DummyFusedMoE: - def __init__(self, moe_layer_id): - self.moe_layer_id = moe_layer_id + def __init__(self, name: str): + self.layer_name = name self.moe_config = _DummyMoEConfig() self.quant_method = _DummyQuantMethod() self.router = _DummyRouter() - @property - def layer_id(self) -> int: - return self.moe_layer_id - monkeypatch.setattr(fused_moe_layer, "FusedMoE", DummyFusedMoE) num_layers, num_tokens, top_k = 4, 8, 2 @@ -45,13 +41,13 @@ class DummyCapturer: def get_device_cache(self): return DummyDeviceCache(buffer) - def map_layer_id(self, id: int) -> int: - return id + def map_layer_to_id(self, name: str) -> int: + return int(name) monkeypatch.setattr(rec_mod, "get_global_experts_capturer", lambda: DummyCapturer()) - m0 = DummyFusedMoE(moe_layer_id=0) - m2 = DummyFusedMoE(moe_layer_id=2) + m0 = DummyFusedMoE(name="0") + m2 = 
DummyFusedMoE(name="2") class DummyModel: def modules(self): diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index ee5bd6f81622..01bffc9ab717 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -334,6 +334,8 @@ def __init__( # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping + print(f"PREFIX = {prefix}") + # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py index 781661737e42..77ce6325338c 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts_capturer.py @@ -221,7 +221,7 @@ def get_host_cache(self): def get_device_cache(self): raise NotImplementedError - def map_layer_id(self, layer_id: int) -> int: + def map_layer_to_id(self, layer_name: str) -> int: raise NotImplementedError @@ -296,7 +296,7 @@ def __init__( device=device, ) - self._id_map: dict[int, int] = {} + self._id_map: dict[str, int] = {} # ---- Async D2H pipeline (rank-0 only) ---- # Non-rank-0 workers only need the device buffer for symmetric @@ -481,12 +481,12 @@ def get_host_cache(self): def get_device_cache(self): return self.device_cache - def map_layer_id(self, layer_id: int) -> int: - if layer_id not in self._id_map: + def map_layer_to_id(self, layer_name: str) -> int: + if layer_name not in self._id_map: next_id = len(self._id_map) - self._id_map[layer_id] = next_id + self._id_map[layer_name] = next_id return next_id - return self._id_map[layer_id] + return self._id_map[layer_name] class _RoutedExpertsCapturerNoop(RoutedExpertsCapturer): @@ -511,7 +511,7 @@ def get_host_cache(self): def get_device_cache(self): pass - def 
map_layer_id(self, layer_id: int) -> int: + def map_layer_to_id(self, layer_name: str) -> int: return 0 @@ -830,7 +830,7 @@ def bind_routing_capture_to_model(model) -> None: f"dp_size={module.moe_config.dp_size})." ) - layer_id = capturer.map_layer_id(module.layer_id) + layer_id = capturer.map_layer_to_id(module.layer_name) layer_buf = buffer[layer_id] # (N_max, K) module.router._routing_replay_out = layer_buf # Mark each per-layer view as static so CUDA graphs don't From 3de6da7a89985e85d46a06cbcae98c1ea78550f1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 11 May 2026 18:57:34 +0000 Subject: [PATCH 166/191] update doc Signed-off-by: Bill Nell --- docs/training/routed_experts_replay.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/training/routed_experts_replay.md b/docs/training/routed_experts_replay.md index 60eae33b848e..7a269f01e932 100644 --- a/docs/training/routed_experts_replay.md +++ b/docs/training/routed_experts_replay.md @@ -109,7 +109,7 @@ When a request has multiple completions (`n > 1`), each completion shares the sa ```text Forward Pass Async D2H Pipeline Output ───────────── ────────────────── ────── -FusedMoE layer After forward pass: On request finish: +FusedMoERouter After forward pass: On request finish: writes topk_ids ──────► D2H copy to pinned ──────► Extract from host cache to device buffer staging buffer Split at prompt_len (L, N, K) int16 (via CUDA stream) Trim gen to output len @@ -125,7 +125,9 @@ A pre-allocated GPU buffer with layout `(L, N, K)` where: - `N` = `max_num_batched_tokens` - `K` = `num_experts_per_tok` (top-k) -The `(L, N, K)` layout ensures that `buffer[layer_id]` gives a contiguous `(N, K)` view per layer. Each `FusedMoE` layer gets a persistent reference to its slice via `module._routing_replay_out = buffer[layer_id]`. +The `(L, N, K)` layout ensures that `buffer[layer_id]` gives a contiguous `(N, K)` view per layer. 
Each `FusedMoERouter` gets a persistent reference to its slice via `router._routing_replay_out = buffer[layer_id]`. + +The `layer_id`s are managed by `RoutedExpertsCapturer` and keyed by `FusedMoE.layer_name`. **Dtype**: `int16` — sufficient for expert IDs (max ~512 experts in practice) and half the memory of `int32`. @@ -146,7 +148,7 @@ This design ensures the D2H copy overlaps with the next forward pass, minimizing CUDA graph compatibility requires two mechanisms: -1. **Persistent tensor attribute**: Each `FusedMoE` layer stores a reference to its buffer slice as `module._routing_replay_out`. Because `torch.compile` captures module attributes by reference, graph replay always writes to the live buffer — not a stale snapshot. +1. **Persistent tensor attribute**: Each `FusedMoERouter` stores a reference to its buffer slice as `router._routing_replay_out`. Because `torch.compile` captures module attributes by reference, graph replay always writes to the live buffer — not a stale snapshot. 2. **Static marking**: Both the full `(L, N, K)` buffer and each per-layer `(N, K)` view are marked with `cudagraph_mark_tensor_static()`. This prevents CUDA graphs from snapshot/restore behavior that would zero the buffer on replay. 
From 3c52f2cd32aa219d9fdf3f36ffbcd84f3a23307b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 11 May 2026 20:04:15 +0000 Subject: [PATCH 167/191] remove debug print Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 01bffc9ab717..ee5bd6f81622 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -334,8 +334,6 @@ def __init__( # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping - print(f"PREFIX = {prefix}") - # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config if prefix in compilation_config.static_forward_context: From 474bfc429abdf2e23c30b6715459178fa89ea3fe Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 12 May 2026 20:32:55 +0000 Subject: [PATCH 168/191] fix merge Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/router/fused_moe_router.py | 2 ++ vllm/model_executor/layers/fused_moe/runner/moe_runner.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py index c77fb05cd3e0..306f9c199961 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_moe_router.py @@ -36,6 +36,7 @@ def _select_experts( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, + topk_indices_dtype: torch.dtype | None = None, *, input_ids: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -66,6 +67,7 @@ def select_experts( topk_weights, topk_ids = self._select_experts( hidden_states, router_logits, + topk_indices_dtype=topk_indices_dtype, input_ids=input_ids, ) diff --git 
a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 2bfe44fbc749..eed6c1f7fc3d 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -648,7 +648,7 @@ def forward( shared_experts_input, input_ids, self._encode_layer_name(), - self._trtllm_mxfp4_unpadded_dim(), + # self._trtllm_mxfp4_unpadded_dim(), ) # From 53884ac9416f6fd0e57e2b283e948833f3fcc146 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 12 May 2026 18:05:26 -0400 Subject: [PATCH 169/191] cleaner fix for unpadded output Signed-off-by: Bill Nell --- .../layers/fused_moe/fused_moe_method_base.py | 8 +++ .../fused_moe/fused_moe_modular_method.py | 8 +++ .../layers/fused_moe/runner/moe_runner.py | 51 ++++++++----------- .../layers/quantization/mxfp4.py | 14 +++++ 4 files changed, 51 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 7c4c05a648fc..888d064d3115 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -144,6 +144,14 @@ def skip_forward_padding(self) -> bool: """Whether to skip the padding in the forward before applying the moe method.""" return False + @property + def has_unpadded_output(self) -> bool: + """ + Indicates that the hidden_states output might be the unpadded + hidden_states shape rather than the full padded shape. 
+ """ + return False + @property def supports_eplb(self) -> bool: return False diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index be5e158b12bf..704d6954e982 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -64,6 +64,14 @@ def make( ), ) + @property + def skip_forward_padding(self) -> bool: + return self.old_quant_method.skip_forward_padding + + @property + def has_unpadded_output(self) -> bool: + return self.old_quant_method.has_unpadded_output + @property def supports_eplb(self) -> bool: return self.old_quant_method.supports_eplb diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index eed6c1f7fc3d..a88d9bb0df43 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -120,6 +120,7 @@ def _moe_forward( shared_experts_input: torch.Tensor | None, input_ids: torch.Tensor | None, layer_name: _layer_name_type, + hidden_dim_unpadded: int, ) -> torch.Tensor: layer = get_layer_from_name(_resolve_layer_name(layer_name)) return layer._forward_impl( @@ -136,7 +137,14 @@ def _moe_forward_fake( shared_experts_input: torch.Tensor | None, input_ids: torch.Tensor | None, layer_name: _layer_name_type, + hidden_dim_unpadded: int, ) -> torch.Tensor: + # `hidden_dim_unpadded > 0` only on the TRT-LLM MXFP4 path, where the + # real kernel writes narrower than `hidden_states.shape[-1]`. Plumbed + # as an op arg (not peeked from the layer registry) to keep the fake + # a pure shape function of its inputs and preserve subgraph dedup. 
+ if hidden_dim_unpadded > 0: + return hidden_states.new_empty((*hidden_states.shape[:-1], hidden_dim_unpadded)) return torch.empty_like(hidden_states) @@ -162,13 +170,17 @@ def _moe_forward_shared_fake( shared_experts_input: torch.Tensor | None, input_ids: torch.Tensor | None, layer_name: _layer_name_type, + hidden_dim_unpadded: int, ) -> tuple[torch.Tensor, torch.Tensor]: - # Output shapes: - # - fused_out: same as hidden_states (routed experts use transformed size) - # - shared_out: same as shared_experts_input if provided, else same as - # hidden_states - # (For latent MoE: shared experts use original hidden_size, not latent size) - fused_out = torch.empty_like(hidden_states) + # `fused_out`: see `_moe_forward_fake` for hidden_dim_unpadded semantics. + # `shared_out`: matches `shared_experts_input` if provided (latent MoE), + # else `hidden_states`. + if hidden_dim_unpadded > 0: + fused_out = hidden_states.new_empty( + (*hidden_states.shape[:-1], hidden_dim_unpadded) + ) + else: + fused_out = torch.empty_like(hidden_states) if shared_experts_input is not None: shared_out = torch.empty_like(shared_experts_input) else: @@ -439,29 +451,6 @@ def _encode_layer_name(self) -> str | LayerName: return "from_forward_context" return self.layer_name - def _trtllm_mxfp4_unpadded_dim(self) -> int: - """Return ``hidden_dim_unpadded`` when the active backend is TRT-LLM - MXFP4 (whose kernel writes narrower than the padded - ``hidden_states.shape[-1]``), else 0. Other MXFP4 backends (notably - Cutlass MXFP4 MXFP8) write the full padded width, so - ``moe_config.hidden_dim_unpadded`` alone is insufficient: it encodes - the model's logical hidden, not whether the kernel narrows. Computed - caller-side and passed as an op arg; doing the isinstance check - inside the fake would specialize per ``layer_name`` and break - subgraph dedup for identical-architecture models (e.g. Phi-MoE). 
- """ - from vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe import ( - TrtLlmMxfp4ExpertsBase, - ) - - moe_kernel = getattr(self._quant_method, "moe_kernel", None) - fused_experts = getattr( - getattr(moe_kernel, "impl", None), "fused_experts", None - ) - if isinstance(fused_experts, TrtLlmMxfp4ExpertsBase): - return self.moe_config.hidden_dim_unpadded or self.moe_config.hidden_dim - return 0 - def _maybe_pad_hidden_states( self, shared_experts_input: torch.Tensor | None, @@ -648,7 +637,9 @@ def forward( shared_experts_input, input_ids, self._encode_layer_name(), - # self._trtllm_mxfp4_unpadded_dim(), + self.moe_config.hidden_dim_unpadded + if self._quant_method.has_unpadded_output + else 0, ) # diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 2c69fc74530d..fb9d760b5e2d 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -158,6 +158,13 @@ def skip_forward_padding(self) -> bool: # so can skip the padding in the forward before applying the moe method return self.mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8 + @property + def has_unpadded_output(self) -> bool: + return self.mxfp4_backend in [ + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16, + ] + def maybe_roundup_sizes( self, hidden_size: int, @@ -492,6 +499,13 @@ def skip_forward_padding(self) -> bool: # so can skip the padding in the forward before applying the moe method return self.mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8 + @property + def has_unpadded_output(self) -> bool: + return self.mxfp4_backend in [ + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16, + ] + def maybe_roundup_sizes( self, hidden_size: int, From 37b076b25b20a0b764874835c55b9999366bb913 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 12 May 2026 18:21:00 -0400 
Subject: [PATCH 170/191] fix Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/runner/moe_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index a88d9bb0df43..8d788d6a47ba 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -154,6 +154,7 @@ def _moe_forward_shared( shared_experts_input: torch.Tensor | None, input_ids: torch.Tensor | None, layer_name: _layer_name_type, + hidden_dim_unpadded: int, ) -> tuple[torch.Tensor, torch.Tensor]: layer = get_layer_from_name(_resolve_layer_name(layer_name)) return layer._forward_impl( From 535d224af1bfd5c43e20c1545d611aac22e82c7a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 20 May 2026 21:54:21 +0000 Subject: [PATCH 171/191] lora + misc merge fixes Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 25 ++++++++++++++++--- vllm/lora/model_manager.py | 18 ++++++------- .../layers/fused_moe/runner/moe_runner.py | 4 +++ .../layers/quantization/humming.py | 3 ++- vllm/utils/import_utils.py | 5 ++++ 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index ea941c5e6482..234373055703 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -35,11 +35,13 @@ def __init__(self, base_layer: MoERunner) -> None: "Monolithic kernels are not supported for Fused MoE LoRA." ) + moe_parallel_config = base_layer.moe_config.moe_parallel_config + # Use the MoE-aware TP rank/size: when EP is active, FusedMoE collapses # moe_parallel_config.tp_size to 1 (experts are sharded across the # TP group instead). 
- self.tp_size = self.base_layer.tp_size - self.tp_rank = self.base_layer.tp_rank + self.tp_size = moe_parallel_config.tp_size + self.tp_rank = moe_parallel_config.tp_rank self.device = _get_lora_device(base_layer) # For non-gated MoE (is_act_and_mul=False), only 1 slice is needed # since there's only up_proj (w1), not gate_proj + up_proj (w1 + w3) @@ -47,7 +49,7 @@ def __init__(self, base_layer: MoERunner) -> None: # Mirrors per-(lora_id) layout of `self.lora_a_stacked` (built in # `create_lora_weights`) so `create_dummy_lora`'s n_slices fallback # matches `lora_a_stacked` length under EP. - self.n_slices = base_layer.local_num_experts * (self._w13_slices + 1) + self.n_slices = self.local_num_experts * (self._w13_slices + 1) routed_experts._ensure_moe_quant_config_init() if getattr(routed_experts.quant_method, "supports_internal_mk", False): @@ -80,6 +82,20 @@ def hidden_size(self) -> int: def local_num_experts(self) -> int: return self.base_layer.moe_config.num_local_experts + @property + def global_num_experts(self) -> int: + return self.base_layer.moe_config.num_experts + + @property + def ep_rank(self) -> int: + moe_config = self.base_layer.moe_config + return moe_config.moe_parallel_config.ep_rank + + @property + def use_ep(self) -> int: + moe_config = self.base_layer.moe_config + return moe_config.moe_parallel_config.use_ep + @property def intermediate_size_per_partition(self) -> int: return self.base_layer.moe_config.intermediate_size_per_partition @@ -181,7 +197,8 @@ def _verify_ep_fs(self, lora_config: LoRAConfig): # EP on the expert dim, fully_sharded on the LoRA rank dim — with # mutually contradictory assumptions about which rank holds which # expert's rank-shard. 
- assert not (self.base_layer.use_ep and lora_config.fully_sharded_loras), ( + routed_experts = self.base_layer.routed_experts + assert not (routed_experts.use_ep and lora_config.fully_sharded_loras), ( "Fused MoE LoRA does not support enable_expert_parallel=True " "together with fully_sharded_loras=True. Disable one of them." ) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 31806772d431..e7aa9e5f5c3d 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -823,8 +823,8 @@ def _stack_moe_lora_weights( # owned expert range before it gets copied into the local # stacked buffer. For non-EP (local == global) this is a # no-op slice. - global_num_experts = module.base_layer.global_num_experts - ep_rank = module.base_layer.ep_rank + global_num_experts = module.global_num_experts + ep_rank = module.ep_rank expert_start = ep_rank * local_num_experts expert_end = expert_start + local_num_experts @@ -927,9 +927,9 @@ def _convert_3d_to_2d_moe_lora( # untouched so set_lora can raise a clear error if needed. return - local_num_experts = module.base_layer.local_num_experts - global_num_experts = module.base_layer.global_num_experts - ep_rank = module.base_layer.ep_rank + local_num_experts = module.local_num_experts + global_num_experts = module.global_num_experts + ep_rank = module.ep_rank expert_start = ep_rank * local_num_experts expert_end = expert_start + local_num_experts @@ -999,15 +999,15 @@ def _slice_moe_lora_ep( the CPU LoRAModel keeps the full global weight and set_lora has to re-slice on every activation. 
""" - if not module.base_layer.use_ep: + if not module.use_ep: return module_lora = self._get_lora_layer_weights(lora_model, module_name) if module_lora is None or not isinstance(module_lora.lora_a, list): return - local_num_experts = module.base_layer.local_num_experts - global_num_experts = module.base_layer.global_num_experts - ep_rank = module.base_layer.ep_rank + local_num_experts = module.local_num_experts + global_num_experts = module.global_num_experts + ep_rank = module.ep_rank expert_start = ep_rank * local_num_experts expert_end = expert_start + local_num_experts diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 8d788d6a47ba..a2df08552274 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -306,6 +306,10 @@ def _select_forward(self) -> Callable: def shared_experts(self) -> SharedExperts | None: return self._shared_experts + @property + def is_internal_router(self) -> bool: + return self.gate is not None + # TODO(bnell): Temporary hack. Get rid of this. 
def _replace_quant_method(self, quant_method: FusedMoEMethodBase): self.routed_experts.quant_method = quant_method diff --git a/vllm/model_executor/layers/quantization/humming.py b/vllm/model_executor/layers/quantization/humming.py index 2920fe673689..a176bf654d81 100644 --- a/vllm/model_executor/layers/quantization/humming.py +++ b/vllm/model_executor/layers/quantization/humming.py @@ -42,8 +42,9 @@ ) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.utils.import_utils import has_humming -if current_platform.is_cuda(): +if has_humming() and current_platform.is_cuda(): from humming.dtypes import DataType from humming.layer import HummingMethod from humming.schema import ( diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py index 5822e5840afc..b61e9c182683 100644 --- a/vllm/utils/import_utils.py +++ b/vllm/utils/import_utils.py @@ -474,3 +474,8 @@ def has_fbgemm_gpu() -> bool: def has_cutedsl() -> bool: """Whether the optional `cutelass` package is available.""" return _has_module("cutlass") + + +def has_humming() -> bool: + """Whether the optional `humming` package is available.""" + return _has_module("humming") From bdae82e0cf1829cc2fa4ef798fb9a58303cb7fe9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 20 May 2026 23:08:26 +0000 Subject: [PATCH 172/191] merge/lint fix Signed-off-by: Bill Nell --- tests/kernels/moe/test_flashinfer_b12x_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/moe/test_flashinfer_b12x_moe.py b/tests/kernels/moe/test_flashinfer_b12x_moe.py index ec0a9594fe12..29b5ecf850a3 100644 --- a/tests/kernels/moe/test_flashinfer_b12x_moe.py +++ b/tests/kernels/moe/test_flashinfer_b12x_moe.py @@ -166,7 +166,7 @@ def test_flashinfer_b12x_moe( num_experts=e, experts_per_token=topk, hidden_dim=k, - intermediate_size_per_partition=n, + intermediate_size=n, in_dtype=dtype, ) From 1959f6dcfbb192568d40000dc1e6d9a6e9ec3920 Mon Sep 17 00:00:00 
2001 From: Bill Nell Date: Wed, 20 May 2026 23:23:57 +0000 Subject: [PATCH 173/191] update FusedMoE comment Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 68 ++++++++++++++----- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d3c9fee10117..0dd0f74f4ac3 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -148,34 +148,66 @@ def FusedMoE( routed_experts_cls: type[RoutedExperts] | None = None, routed_experts_args: dict[str, Any] | None = None, ) -> MoERunner: - # TODO update comment - """FusedMoE layer builder for MoE models. + """Factory function for creating MoE execution pipeline. - This layer contains both MergedColumnParallel weights (gate_up_proj / - w13) and RowParallelLinear weights (down_proj/ w2). + Creates and configures a complete MoE execution pipeline including: + - Router (for token-to-expert assignment) + - RoutedExperts (containing expert weight parameters) + - MoERunner (orchestrates the complete forward pass) + + The experts contain both MergedColumnParallel weights (gate_up_proj/w13) + and RowParallelLinear weights (down_proj/w2). Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We copy that naming convention here and handle any remapping in the load_weights function in each model implementation. Args: - num_experts: Number of experts in the model + num_experts: Number of experts in the model (global count) top_k: Number of experts selected for each token hidden_size: Input hidden state size of the transformer intermediate_size: Intermediate size of the experts - params_dtype: Data type for the parameters. - renormalize: Whether to renormalize the logits in the fused_moe kernel - quant_config: Quantization configure. - enable_eplb: Whether to enable expert parallelism load balancer. 
- router_logits_dtype: Data type for router logits buffers. - routed_scaling_factor: A scaling factor that is applied to the topk_weights - by the router or the output of the layer depending - on the value of `apply_routed_scale_to_output` - apply_routed_scale_to_output: Determine whether or not `routed_scaling_factor` - is applied to the topk_weights or to the experts - output. It is applied to the experts output - instead of the topk_weights when this feature is - not supported by the router (or the experts). + params_dtype: Data type for the parameters + renormalize: Whether to renormalize the logits in the router + use_grouped_topk: Whether to use grouped top-k routing + num_expert_group: Number of expert groups for grouped top-k + topk_group: Top-k value per group for grouped top-k + quant_config: Quantization configuration + tp_size: Tensor parallelism size (None = use global default) + dp_size: Data parallelism size (None = use global default) + pcp_size: Pipeline context parallelism size (None = use global default) + prefix: Layer name prefix for weight loading + custom_routing_function: Custom routing function override + router: Pre-configured router instance (None = create default) + scoring_func: Scoring function for routing ("softmax" or others) + routed_scaling_factor: Scaling factor applied to topk_weights or output + swiglu_limit: SwiGLU activation limit + e_score_correction_bias: Expert score correction bias tensor + apply_router_weight_on_input: Whether to apply router weights on input + activation: Activation function name ("silu", "gelu", etc.) 
+ enable_eplb: Whether to enable expert parallelism load balancer + num_redundant_experts: Number of redundant experts for EPLB + has_bias: Whether expert layers have bias terms + is_sequence_parallel: Whether sequence parallelism is enabled + expert_mapping: Expert parameter mapping for weight loading + n_shared_experts: Number of shared experts (ROCm aiter only) + router_logits_dtype: Data type for router logits buffers + gate: Pre-configured gate module + shared_experts: Pre-configured shared experts module + shared_expert_gate: Pre-configured shared expert gate module + routed_input_transform: Input transformation module + routed_output_transform: Output transformation module + apply_routed_scale_to_output: Whether to apply routed_scaling_factor to + output instead of topk_weights + zero_expert_type: Type of zero expert handling + hash_indices_table: Hash table for expert indices + runner_cls: Custom MoERunner class (None = use default MoERunner) + runner_args: Additional arguments for runner constructor + routed_experts_cls: Custom RoutedExperts class (None = use default) + routed_experts_args: Additional arguments for routed_experts constructor + + Returns: + MoERunner: Configured MoE execution pipeline ready for forward passes """ vllm_config = get_current_vllm_config() From 9028c9a8439196bb9885ac53579dc33b46608262 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 21 May 2026 00:35:38 +0000 Subject: [PATCH 174/191] add missing type annotation Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 0dd0f74f4ac3..16769f1468ee 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -131,7 +131,7 @@ def FusedMoE( enable_eplb: bool = False, num_redundant_experts: int = 0, has_bias: bool = False, - 
is_sequence_parallel=False, + is_sequence_parallel: bool = False, expert_mapping: list[tuple[str, str, int, str]] | None = None, n_shared_experts: int | None = None, router_logits_dtype: torch.dtype | None = None, From 0c7b0747f88634b8f7d4826a6f92f4dff3a54bd2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 28 May 2026 23:36:51 +0000 Subject: [PATCH 175/191] cleanups Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 32 ++++++++----------- vllm/lora/model_manager.py | 8 ++--- .../fused_moe/experts/fused_humming_moe.py | 1 - vllm/model_executor/layers/fused_moe/layer.py | 2 +- .../layers/fused_moe/routed_experts.py | 6 +++- .../layers/fused_moe/runner/moe_runner.py | 14 +++----- .../layers/fused_moe/runner/shared_experts.py | 6 ++-- .../layers/quantization/mxfp4.py | 2 ++ .../layers/quantization/utils/marlin_utils.py | 2 +- vllm/model_executor/models/gpt_oss.py | 2 +- 10 files changed, 36 insertions(+), 39 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index ffd226d35dde..c562dad33151 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -28,6 +28,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: MoERunner) -> None: super().__init__() self.base_layer = base_layer + self.moe_config = base_layer.moe_config self._shared_experts = base_layer._shared_experts self._ep_check() @@ -36,11 +37,10 @@ def __init__(self, base_layer: MoERunner) -> None: "Monolithic kernels are not supported for Fused MoE LoRA." ) - moe_parallel_config = base_layer.moe_config.moe_parallel_config - # Use the MoE-aware TP rank/size: when EP is active, FusedMoE collapses # moe_parallel_config.tp_size to 1 (experts are sharded across the # TP group instead). 
+ moe_parallel_config = self.moe_config.moe_parallel_config self.tp_size = moe_parallel_config.tp_size self.tp_rank = moe_parallel_config.tp_rank self.device = _get_lora_device(base_layer) @@ -80,29 +80,27 @@ def __init__(self, base_layer: MoERunner) -> None: @property def hidden_size(self) -> int: - return self.base_layer.moe_config.hidden_dim + return self.moe_config.hidden_dim @property def local_num_experts(self) -> int: - return self.base_layer.moe_config.num_local_experts + return self.moe_config.num_local_experts @property def global_num_experts(self) -> int: - return self.base_layer.moe_config.num_experts + return self.moe_config.num_experts @property def ep_rank(self) -> int: - moe_config = self.base_layer.moe_config - return moe_config.moe_parallel_config.ep_rank + return self.moe_config.moe_parallel_config.ep_rank @property - def use_ep(self) -> int: - moe_config = self.base_layer.moe_config - return moe_config.moe_parallel_config.use_ep + def use_ep(self) -> bool: + return self.moe_config.moe_parallel_config.use_ep @property def intermediate_size_per_partition(self) -> int: - return self.base_layer.moe_config.intermediate_size_per_partition + return self.moe_config.intermediate_size_per_partition def _init_lora_stream_context(self) -> None: self._lora_stream: torch.cuda.Stream | None = None @@ -118,7 +116,6 @@ def _init_lora_stream_context(self) -> None: self._events = tuple(torch.cuda.Event() for _ in range(4)) def _build_lora_context(self): - moe_config = self.base_layer.moe_config use_dual_stream = ( self._enable_aux_cuda_stream and not self.fully_sharded @@ -131,7 +128,8 @@ def _build_lora_context(self): w2_lora_b_stacked=self.w2_lora_b_stacked, adapter_enabled=self.adapter_enabled, max_loras=self.max_loras, - top_k=moe_config.experts_per_token, + # top_k=self.moe_config.experts_per_token, + top_k=self.base_layer.routed_experts.top_k, w13_num_slices=self._w13_slices, fully_sharded=self.fully_sharded, tp_rank=self.tp_rank, @@ -206,9 +204,8 @@ def 
_create_lora_b_weights(self, max_loras: int, lora_config: LoRAConfig): ) def _ep_check(self): - routed_experts = self.base_layer.routed_experts - if routed_experts.use_ep: - moe_config = routed_experts.moe_config + if self.use_ep: + moe_config = self.moe_config all2all_backend = moe_config.moe_parallel_config.all2all_backend assert all2all_backend == "allgather_reducescatter", ( "Fused MoE LoRA with EP currently only supports " @@ -221,8 +218,7 @@ def _verify_ep_fs(self, lora_config: LoRAConfig): # EP on the expert dim, fully_sharded on the LoRA rank dim — with # mutually contradictory assumptions about which rank holds which # expert's rank-shard. - routed_experts = self.base_layer.routed_experts - assert not (routed_experts.use_ep and lora_config.fully_sharded_loras), ( + assert not (self.use_ep and lora_config.fully_sharded_loras), ( "Fused MoE LoRA does not support enable_expert_parallel=True " "together with fully_sharded_loras=True. Disable one of them." ) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 3e3296fea882..4cdda050ae64 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -1097,11 +1097,11 @@ def _build_moe_ep_load_spec(self) -> MoEEPLoadSpec | None: ) if module is None: return None - base = module.base_layer + # base = module.base_layer return MoEEPLoadSpec( - ep_rank=base.ep_rank, - local_num_experts=base.local_num_experts, - global_num_experts=base.global_num_experts, + ep_rank=module.ep_rank, + local_num_experts=module.local_num_experts, + global_num_experts=module.global_num_experts, ) def _get_lora_layer_weights( diff --git a/vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py b/vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py index 928d17f5ec90..8874228a1429 100644 --- a/vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/fused_humming_moe.py @@ -67,7 +67,6 @@ def get_humming_moe_gemm_type() -> str: 
class HummingExpertsBase(mk.FusedMoEExpertsModular): def __init__( self, - # TODO(bnell): this should not be passed to the MK. layer: "RoutedExperts", moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 88a7c2a610a7..19d6893d335a 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -319,10 +319,10 @@ def FusedMoE( max_num_tokens=max_num_batched_tokens, has_bias=has_bias, is_lora_enabled=vllm_config.lora_config is not None, - swiglu_limit=swiglu_limit, activation=moe_activation, device=vllm_config.device_config.device, routing_method=router.routing_method_type, # Not ideal + swiglu_limit=swiglu_limit, ) logger.debug("FusedMoEConfig = %s", moe_config) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 20c8be0c71e5..6afe4c024db7 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -78,7 +78,11 @@ def __init__( self.rocm_aiter_fmoe_enabled = moe_config.rocm_aiter_fmoe_enabled - # Bit of hack until things are settled + # Set any remaining kwargs as attributes. This is necessary because + # it is not simple to track all the layer attributes queried by + # quantization methods and various utilities. + # It would be good to eventually codify these in the FusedMoEConfig + # or some other config. 
self.__dict__.update(kwargs) self.quant_method = self._get_quant_method( diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index b665d76981ff..f33ff42e6c59 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -275,12 +275,12 @@ def __init__( self._shared_experts: SharedExperts | None = None if shared_experts is not None: + can_overlap = lambda: self._quant_method.mk_can_overlap_shared_experts self._shared_experts = SharedExperts( shared_experts, moe_config=moe_config, enable_dbo=enable_dbo, - # TODO: use lambda? - mk_can_overlap_shared_experts=routed_experts.quant_method.mk_can_overlap_shared_experts, + mk_can_overlap_shared_experts=can_overlap, ) # Needed for string -> MoERunner layer lookup in custom ops. @@ -315,10 +315,6 @@ def is_internal_router(self) -> bool: # TODO(bnell): Temporary hack. Get rid of this. def _replace_quant_method(self, quant_method: FusedMoEMethodBase): self.routed_experts.quant_method = quant_method - if self.shared_experts is not None: - self.shared_experts._mk_can_overlap_shared_experts = ( - quant_method.mk_can_overlap_shared_experts - ) def _maybe_fuse_gate_weights(self): """Fuse router and shared expert gate weights on first call. @@ -801,11 +797,11 @@ def _forward_impl( hidden_states, ) - # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + ######################################################### # - # Old methods from FusedMoE layer + # Old methods from FusedMoE layer. Remove when possible. # - # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + ######################################################### # Note: maybe_init_modular_kernel should only be called by # prepare_communication_buffer_for_model. 
diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index 52c3e21894f4..d44d60b19be5 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable from enum import IntEnum import torch @@ -41,7 +42,7 @@ def __init__( layer: torch.nn.Module, moe_config: FusedMoEConfig, enable_dbo: bool, - mk_can_overlap_shared_experts: bool, + mk_can_overlap_shared_experts: Callable[[], bool], ): super().__init__() @@ -54,7 +55,6 @@ def __init__( self._layer = layer self._moe_config = moe_config - # TODO: Make sure MK is constructed before this is accessed!!!!!!!!!!!!!! self._mk_can_overlap_shared_experts = mk_can_overlap_shared_experts # Allow disabling of the separate shared experts stream for @@ -89,7 +89,7 @@ def _determine_shared_experts_order( if self._disable_shared_experts_overlap: return SharedExpertsOrder.NO_OVERLAP - if self._mk_can_overlap_shared_experts: + if self._mk_can_overlap_shared_experts(): return SharedExpertsOrder.MK_INTERNAL_OVERLAPPED should_run_shared_in_aux_stream = ( diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index fb9d760b5e2d..bb200467fff6 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -158,6 +158,7 @@ def skip_forward_padding(self) -> bool: # so can skip the padding in the forward before applying the moe method return self.mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8 + # TODO(bnell): move to MK/expert_class? 
@property def has_unpadded_output(self) -> bool: return self.mxfp4_backend in [ @@ -499,6 +500,7 @@ def skip_forward_padding(self) -> bool: # so can skip the padding in the forward before applying the moe method return self.mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8 + # TODO(bnell): move to MK/expert_class? @property def has_unpadded_output(self) -> bool: return self.mxfp4_backend in [ diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index ef6782585750..eca04eed74b6 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -475,7 +475,7 @@ def get__quant_fp8_method() -> QuantFP8: return _quant_fp8_method -def get_marlin_input_dtype(prefix: str | None = None): # ? +def get_marlin_input_dtype(prefix: str | None = None): if envs.VLLM_MARLIN_INPUT_DTYPE is None: return elif envs.VLLM_MARLIN_INPUT_DTYPE.lower() == "int8": diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 0c726ae6325a..055892ecec32 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -380,7 +380,7 @@ def _load_weights_mxfp4( tp_rank_start = tp_rank * per_rank_intermediate_size tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, intermediate_size) - # Use centralized weight remapping for MoE expert parameters (Solution 7) + # Use centralized weight remapping for MoE expert parameters for name, weight in remap_moe_expert_weights(weights, params_dict): # Skip layers on other devices. 
if is_pp_missing_parameter(name, self): From dd8107f7b9cc485b35d95eff8d426d9d90fe44fc Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 29 May 2026 01:43:28 +0000 Subject: [PATCH 176/191] fix Signed-off-by: Bill Nell --- vllm/lora/layers/fused_moe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index c562dad33151..b7b9987341ad 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -419,8 +419,8 @@ def quant_method(self): return self.base_layer._quant_method @property - def runner(self): - return self.base_layer.runner + def runner(self) -> MoERunner: + return self.base_layer @property def is_internal_router(self) -> bool: From 1f26c1bf517f640f8ea608372596615d876ab4ea Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 29 May 2026 01:50:29 +0000 Subject: [PATCH 177/191] claude fix for test_gps_oss_tp2 Signed-off-by: Bill Nell --- tests/lora/test_gptoss_tp.py | 7 ++++--- vllm/lora/punica_wrapper/punica_gpu.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py index 648660734655..847741c129cb 100644 --- a/tests/lora/test_gptoss_tp.py +++ b/tests/lora/test_gptoss_tp.py @@ -130,9 +130,10 @@ def test_gpt_oss_lora_tp2( gpu_memory_utilization=0.8, fully_sharded_loras=fully_sharded_loras, enable_expert_parallel=not fully_sharded_loras, - compilation_config=vllm.config.CompilationConfig( # Avoid OOM - cudagraph_specialize_lora=False, - ), + # compilation_config=vllm.config.CompilationConfig( # Avoid OOM + # cudagraph_specialize_lora=False, + # ), + enforce_eager=True, ) generate_and_test(llm, gptoss20b_lora_files, lora_id=1) diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 87500ec3ec25..ccf95eb68477 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -570,7 +570,8 @@ def add_lora_w13( 
SPARSITY_FACTOR = 8 naive_block_assignment = ( - expert_map is None + not fully_sharded + and expert_map is None and num_tokens * top_k * SPARSITY_FACTOR <= local_num_experts * max_loras ) From d3832f94f2738e590954fdbe0451a18abed4cf40 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 29 May 2026 03:07:55 +0000 Subject: [PATCH 178/191] padding fix for cudagraph + test_gpt_oss_tp2 Signed-off-by: Bill Nell --- tests/lora/test_gptoss_tp.py | 7 +++---- .../layers/fused_moe/runner/moe_runner.py | 10 +++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py index 847741c129cb..70129671f0d5 100644 --- a/tests/lora/test_gptoss_tp.py +++ b/tests/lora/test_gptoss_tp.py @@ -130,10 +130,9 @@ def test_gpt_oss_lora_tp2( gpu_memory_utilization=0.8, fully_sharded_loras=fully_sharded_loras, enable_expert_parallel=not fully_sharded_loras, - # compilation_config=vllm.config.CompilationConfig( # Avoid OOM - # cudagraph_specialize_lora=False, - # ), - enforce_eager=True, + compilation_config=vllm.config.CompilationConfig( + cudagraph_specialize_lora=False, + ), ) generate_and_test(llm, gptoss20b_lora_files, lora_id=1) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index f33ff42e6c59..526cea7d3562 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -439,7 +439,7 @@ def _maybe_reduce_final_output( and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) and not self._fused_output_is_reduced ): - states = tensor_model_parallel_all_reduce(states.contiguous()) + states = tensor_model_parallel_all_reduce(states) return states @@ -657,9 +657,6 @@ def forward( # Extract outputs from result shared_output, fused_output = _unpack(result) - # Remember 40794. 
Double check tests/lora/test_gpt_oss.py::test_gpt_oss_tp2 - fused_output = fused_output[:, :og_hidden_dim] - # If combine kernel already reduced fused, reduce shared to match. # See note above re: the two all-reduce points. shared_output = self._maybe_reduce_shared_expert_output(shared_output) @@ -672,12 +669,15 @@ def forward( fused_output = self.apply_routed_output_transform(fused_output) if shared_output is not None: - result = shared_output + fused_output + result = shared_output + fused_output[:, :og_hidden_dim] else: result = fused_output result = self._maybe_reduce_final_output(result) + if shared_output is None: + result = result[..., :og_hidden_dim] + return self._maybe_add_zero_expert_output(result) @property From fdcab37577c21c4ebba330d2f916099a45597619 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 29 May 2026 12:33:50 +0000 Subject: [PATCH 179/191] fix renamed func Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/runner/moe_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 526cea7d3562..f8b4c67857dd 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -819,7 +819,7 @@ def maybe_init_modular_kernel(self) -> None: self.routed_experts._ensure_moe_quant_config_init() # routing_tables only needed for round-robin expert placement with # DeepEP all2all backend. 
- routing_tables = self._maybe_init_expert_routing_tables() + routing_tables = self._expert_routing_tables() if isinstance(self.routed_experts.quant_method, FusedMoEModularMethod): base_quant_method = self.routed_experts.quant_method.old_quant_method From 66800ed63889e3b795aaa82b8cf0daf06ff18095 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 29 May 2026 19:35:47 +0000 Subject: [PATCH 180/191] cleanups + fixes Signed-off-by: Bill Nell --- benchmarks/kernels/benchmark_moe.py | 2 +- .../distributed/test_eplb_fused_moe_layer.py | 18 +++--- .../test_eplb_fused_moe_layer_dep_nvfp4.py | 1 - tests/kernels/moe/test_moe_layer.py | 3 - tests/kernels/moe/test_ocp_mx_moe.py | 2 +- tests/kernels/moe/test_trtllm_nvfp4_moe.py | 3 +- tests/kernels/moe/test_zero_expert_moe.py | 56 +++++++++++-------- .../test_routed_experts_capture.py | 5 +- vllm/lora/layers/fused_moe.py | 3 +- vllm/lora/model_manager.py | 1 - .../layers/fused_moe/eep_reconfigure.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 2 - .../layers/fused_moe/routed_experts.py | 2 +- .../layers/fused_moe/router/base_router.py | 5 -- .../layers/fused_moe/runner/moe_runner.py | 2 +- .../fused_moe/runner/moe_runner_interface.py | 4 +- 16 files changed, 54 insertions(+), 57 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 4463a23772ee..c447ca2e8455 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -250,7 +250,7 @@ def run(): num_experts=num_experts, experts_per_token=topk, hidden_dim=hidden_size, - intermediate_size_per_partition=shard_intermediate_size, + intermediate_size=shard_intermediate_size, num_local_experts=num_experts, num_logical_experts=num_experts, activation=MoEActivation.SILU, diff --git a/tests/distributed/test_eplb_fused_moe_layer.py b/tests/distributed/test_eplb_fused_moe_layer.py index eacdb3abc363..080ba9583d30 100644 --- a/tests/distributed/test_eplb_fused_moe_layer.py +++ 
b/tests/distributed/test_eplb_fused_moe_layer.py @@ -75,9 +75,9 @@ def make_fused_moe_layer( intermediate_size=test_config.intermediate_size, prefix=f"dummy_layer_{layer_idx}", activation="silu", - is_act_and_mul=True, params_dtype=test_config.weight_dtype, ) + re = fml.routed_experts device = torch.device(f"cuda:{rank}") @@ -90,12 +90,12 @@ def make_fused_moe_layer( tensor_device=device, ) - assert isinstance(fml.w13_weight.data, torch.Tensor) - assert isinstance(fml.w2_weight.data, torch.Tensor) - fml.w13_weight.data = fml.w13_weight.data.to(device=device) - fml.w2_weight.data = fml.w2_weight.data.to(device=device) - w13_weight = fml.w13_weight.data - w2_weight = fml.w2_weight.data + assert isinstance(re.w13_weight.data, torch.Tensor) + assert isinstance(re.w2_weight.data, torch.Tensor) + re.w13_weight.data = re.w13_weight.data.to(device=device) + re.w2_weight.data = re.w2_weight.data.to(device=device) + w13_weight = re.w13_weight.data + w2_weight = re.w2_weight.data assert w13_weight.size(0) == test_config.num_local_experts for i in range(test_config.num_local_experts): g_i = rank * test_config.num_local_experts + i @@ -170,10 +170,10 @@ def block_quant_scales_shape( assert not w2_weight_scale_inv.is_contiguous() # Add scales to the parameter list - fml.w13_weight_scale_inv = torch.nn.Parameter( + re.w13_weight_scale_inv = torch.nn.Parameter( w13_weight_scale_inv, requires_grad=False ) - fml.w2_weight_scale_inv = torch.nn.Parameter( + re.w2_weight_scale_inv = torch.nn.Parameter( w2_weight_scale_inv, requires_grad=False ) diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py index 9ab785af3135..8035651d5b55 100644 --- a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py +++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py @@ -59,7 +59,6 @@ def make_fused_moe_layer( intermediate_size=test_config.intermediate_size, prefix=f"dummy_layer_{layer_idx}", activation="silu", - 
is_act_and_mul=True, params_dtype=torch.bfloat16, quant_config=quant_config, ) diff --git a/tests/kernels/moe/test_moe_layer.py b/tests/kernels/moe/test_moe_layer.py index cf2611c9e20e..64c38c17616b 100644 --- a/tests/kernels/moe/test_moe_layer.py +++ b/tests/kernels/moe/test_moe_layer.py @@ -1500,9 +1500,6 @@ def _run_one_config( is_sequence_parallel=is_sequence_parallel, ) - # if moe_layer._expert_map is not None: - # moe_layer._expert_map = moe_layer._expert_map.to(device) - num_tokens = m # num_tokens_across_dp should have one entry per DP group, not per # total rank. diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index 8ed7757f6553..5c52c8af6a82 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -1322,7 +1322,7 @@ def test_rocm_mxfp4_moe_oracle( num_experts=num_experts, experts_per_token=topk, hidden_dim=hidden_size, - intermediate_size_per_partition=intermediate_size, + intermediate_size=intermediate_size, num_local_experts=num_experts, num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), diff --git a/tests/kernels/moe/test_trtllm_nvfp4_moe.py b/tests/kernels/moe/test_trtllm_nvfp4_moe.py index 4b4c3e712be5..2653b711d9fc 100644 --- a/tests/kernels/moe/test_trtllm_nvfp4_moe.py +++ b/tests/kernels/moe/test_trtllm_nvfp4_moe.py @@ -164,14 +164,13 @@ def test_trtllm_fp4_moe_no_graph( num_experts=e, experts_per_token=topk, hidden_dim=k, - intermediate_size_per_partition=n, + intermediate_size=n, num_local_experts=e, num_logical_experts=e, activation=activation, device="cuda", moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), in_dtype=dtype, - is_act_and_mul=is_gated_act, routing_method=RoutingMethodType.TopK, max_num_tokens=next_power_of_2(m), ) diff --git a/tests/kernels/moe/test_zero_expert_moe.py b/tests/kernels/moe/test_zero_expert_moe.py index f10459aa5192..71e33b7dfacc 100644 --- 
a/tests/kernels/moe/test_zero_expert_moe.py +++ b/tests/kernels/moe/test_zero_expert_moe.py @@ -59,7 +59,7 @@ def zero_expert_moe(dist_init, default_vllm_config): scoring_func="softmax", ).cuda() - layer.quant_method.process_weights_after_loading(layer) + layer._quant_method.process_weights_after_loading(layer.routed_experts) yield layer, vllm_config @@ -73,12 +73,12 @@ def test_zero_expert_moe_router_is_zero_expert_router(zero_expert_moe, num_token ) -@pytest.mark.parametrize("num_tokens", [1, 32]) -def test_zero_expert_moe_no_custom_routing_fn(zero_expert_moe, num_tokens): - """Verify that custom_routing_function is not set (routing is handled - by ZeroExpertRouter, not a memoizing closure).""" - layer, _ = zero_expert_moe - assert layer.custom_routing_function is None +# @pytest.mark.parametrize("num_tokens", [1, 32]) +# def test_zero_expert_moe_no_custom_routing_fn(zero_expert_moe, num_tokens): +# """Verify that custom_routing_function is not set (routing is handled +# by ZeroExpertRouter, not a memoizing closure).""" +# layer, _ = zero_expert_moe +# #assert layer.custom_routing_function is None @pytest.mark.parametrize("num_tokens", [1, 32]) @@ -86,7 +86,7 @@ def test_zero_expert_moe_forward(zero_expert_moe, num_tokens): """Run a forward pass through FusedMoE with zero experts and verify output shape.""" layer, vllm_config = zero_expert_moe - hidden_size = layer.hidden_size + hidden_size = layer.routed_experts.hidden_size num_experts = 4 zero_expert_num = 1 total_experts = num_experts + zero_expert_num @@ -135,7 +135,10 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): total_experts = num_experts + zero_expert_num hidden_states = torch.randn( - num_tokens, layer.hidden_size, dtype=torch.bfloat16, device="cuda" + num_tokens, + layer.routed_experts.hidden_size, + dtype=torch.bfloat16, + device="cuda", ) router_logits = torch.randn( num_tokens, total_experts, dtype=torch.float32, device="cuda" @@ -153,20 +156,26 @@ def 
test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): # experts. Use a separate prefix to avoid collision. plain_layer = FusedMoE( num_experts=num_experts, - top_k=layer.top_k, - hidden_size=layer.hidden_size, - intermediate_size=layer.intermediate_size_per_partition, + top_k=layer.routed_experts.top_k, + hidden_size=layer.routed_experts.hidden_size, + intermediate_size=layer.routed_experts.intermediate_size_per_partition, params_dtype=torch.bfloat16, prefix="test_zero_expert_moe_plain", renormalize=False, scoring_func="softmax", - e_score_correction_bias=layer.e_score_correction_bias, + e_score_correction_bias=layer.routed_experts.e_score_correction_bias, ).cuda() # Share weights from the zero expert layer. - plain_layer.w13_weight.data.copy_(layer.w13_weight.data) - plain_layer.w2_weight.data.copy_(layer.w2_weight.data) - plain_layer.quant_method.process_weights_after_loading(plain_layer) + plain_layer.routed_experts.w13_weight.data.copy_( + layer.routed_experts.w13_weight.data + ) + plain_layer.routed_experts.w2_weight.data.copy_( + layer.routed_experts.w2_weight.data + ) + plain_layer._quant_method.process_weights_after_loading( + plain_layer.routed_experts + ) # Compute routing via the ZeroExpertRouter. This produces masked # topk_weights/topk_ids (zero expert entries have weight=0, id=0) @@ -178,8 +187,8 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): # Compute real expert output using the plain layer with the masked # routing from the ZeroExpertRouter. 
- real_output = plain_layer.quant_method.apply( - layer=plain_layer, + real_output = plain_layer._quant_method.apply( + layer=plain_layer.routed_experts, x=hidden_states, topk_weights=topk_weights, topk_ids=topk_ids, @@ -199,8 +208,8 @@ def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens): torch.testing.assert_close( full_output, expected, - atol=0, - rtol=0, + atol=4e-3, + rtol=4e-3, msg="FusedMoE output should equal plain FusedMoE output " "plus zero expert contribution", ) @@ -221,7 +230,10 @@ def test_zero_expert_moe_zero_expert_is_identity(zero_expert_moe, num_tokens): total_experts = num_experts + zero_expert_num hidden_states = torch.randn( - num_tokens, layer.hidden_size, dtype=torch.bfloat16, device="cuda" + num_tokens, + layer.routed_experts.hidden_size, + dtype=torch.bfloat16, + device="cuda", ) # Strongly bias toward the zero expert (index 4). router_logits = torch.full( @@ -246,7 +258,7 @@ def test_zero_expert_moe_zero_expert_is_identity(zero_expert_moe, num_tokens): hidden_states=hidden_states, gating_output=router_logits, e_score_correction_bias=layer.router.e_score_correction_bias.data, - topk=layer.top_k, + topk=layer.routed_experts.top_k, renormalize=layer.router.renormalize, scoring_func=layer.router.scoring_func, ) diff --git a/tests/model_executor/test_routed_experts_capture.py b/tests/model_executor/test_routed_experts_capture.py index 62e1cb91bea2..d1a542396e6e 100644 --- a/tests/model_executor/test_routed_experts_capture.py +++ b/tests/model_executor/test_routed_experts_capture.py @@ -63,7 +63,6 @@ def _make_router(eplb_state: EplbLayerState | None = None) -> DummyRouter: top_k=2, global_num_experts=16, eplb_state=eplb_state, - indices_type_getter=None, ) @@ -135,7 +134,7 @@ def capture(self, layer_id, topk_ids): # Patch the runtime import inside _bind_routed_experts_capturer. 
import vllm.model_executor.layers.fused_moe.layer as fused_moe_layer - monkeypatch.setattr(fused_moe_layer, "FusedMoE", DummyFusedMoE) + monkeypatch.setattr(fused_moe_layer, "MoERunner", DummyFusedMoE) dummy_self = types.SimpleNamespace( compilation_config=types.SimpleNamespace( @@ -174,7 +173,7 @@ def capture(self, layer_id, topk_ids): import vllm.model_executor.layers.fused_moe.layer as fused_moe_layer - monkeypatch.setattr(fused_moe_layer, "FusedMoE", DummyFusedMoE) + monkeypatch.setattr(fused_moe_layer, "MoERunner", DummyFusedMoE) dummy_self = types.SimpleNamespace( compilation_config=types.SimpleNamespace( diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index b7b9987341ad..7b400bc5e978 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -128,8 +128,7 @@ def _build_lora_context(self): w2_lora_b_stacked=self.w2_lora_b_stacked, adapter_enabled=self.adapter_enabled, max_loras=self.max_loras, - # top_k=self.moe_config.experts_per_token, - top_k=self.base_layer.routed_experts.top_k, + top_k=self.moe_config.experts_per_token, w13_num_slices=self._w13_slices, fully_sharded=self.fully_sharded, tp_rank=self.tp_rank, diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 4cdda050ae64..ea4895c4896b 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -1097,7 +1097,6 @@ def _build_moe_ep_load_spec(self) -> MoEEPLoadSpec | None: ) if module is None: return None - # base = module.base_layer return MoEEPLoadSpec( ep_rank=module.ep_rank, local_num_experts=module.local_num_experts, diff --git a/vllm/model_executor/layers/fused_moe/eep_reconfigure.py b/vllm/model_executor/layers/fused_moe/eep_reconfigure.py index 60d2e62ba958..2c6c3f6aa40a 100644 --- a/vllm/model_executor/layers/fused_moe/eep_reconfigure.py +++ b/vllm/model_executor/layers/fused_moe/eep_reconfigure.py @@ -61,7 +61,7 @@ def make_eep_staged_quant_method( module: "MoERunner", moe_config: FusedMoEConfig, ) -> 
FusedMoEMethodBase | None: - quant_method = module.quant_method + quant_method = module._quant_method if not quant_method.supports_internal_mk: return None if getattr(quant_method, "wraps_legacy_quant_method", False): diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 19d6893d335a..11f2c2429bf2 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -52,8 +52,6 @@ def make_parallel_config( ) dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size - - is_sequence_parallel = is_sequence_parallel sp_size = tp_size_ if is_sequence_parallel else 1 moe_parallel_config = FusedMoEParallelConfig.make( diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index b8279f183958..759beea89dd0 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -240,7 +240,7 @@ def update_expert_map(self): # Note: ExpertMapManager.update() recalculates expert maps and # reinitializes routing tables internally. self.expert_map_manager.update( - self.moe_parallel_config, + self.moe_config.moe_parallel_config, global_num_experts=self.global_num_experts, ) diff --git a/vllm/model_executor/layers/fused_moe/router/base_router.py b/vllm/model_executor/layers/fused_moe/router/base_router.py index 91c1979be882..4ba855b645fe 100644 --- a/vllm/model_executor/layers/fused_moe/router/base_router.py +++ b/vllm/model_executor/layers/fused_moe/router/base_router.py @@ -151,11 +151,6 @@ def __init__( eplb_state: EplbLayerState | None = None, ): """ - Note: the indices dtype might not be available at router construction - time, so we need to supply a callback to get it at runtime. 
This is - because the indices type is supplied by modular kernels which are - created after MoE layer/router construction. - Args: top_k: Number of experts to select per token global_num_experts: Total number of experts diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index f8b4c67857dd..3efdda397746 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -835,7 +835,7 @@ def maybe_init_modular_kernel(self) -> None: ) self._replace_quant_method( FusedMoEModularMethod.make( - self, + self.routed_experts, base_quant_method, prepare_finalize, ) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py index ddb7c596a180..cc79095ead8a 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner_interface.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from abc import abstractmethod +from abc import ABC, abstractmethod from collections.abc import Iterable import torch @@ -16,7 +16,7 @@ ) -class MoERunnerInterface(PluggableLayer): +class MoERunnerInterface(PluggableLayer, ABC): """ Abstract base class for Mixture of Experts (MoE) runners. 
From df1a74348a3b49207cd3fa604b014fbf346792c8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 29 May 2026 20:37:36 +0000 Subject: [PATCH 181/191] make extra parameters explicit + test cleanups Signed-off-by: Bill Nell --- .../distributed/test_eplb_fused_moe_layer.py | 20 ++++++-- .../test_eplb_fused_moe_layer_dep_nvfp4.py | 47 ++++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 4 +- .../layers/fused_moe/routed_experts.py | 35 +++++++++++--- 4 files changed, 76 insertions(+), 30 deletions(-) diff --git a/tests/distributed/test_eplb_fused_moe_layer.py b/tests/distributed/test_eplb_fused_moe_layer.py index 080ba9583d30..3370f1d3785e 100644 --- a/tests/distributed/test_eplb_fused_moe_layer.py +++ b/tests/distributed/test_eplb_fused_moe_layer.py @@ -8,10 +8,12 @@ import pytest import torch -from vllm.config import VllmConfig, set_current_vllm_config +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.eplb_communicator import create_eplb_communicator from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( ensure_model_parallel_initialized, + get_eplb_group, get_tp_group, ) from vllm.model_executor.layers.fused_moe.layer import FusedMoE @@ -185,9 +187,12 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig): # to expert parallel) set_env_vars_and_device(env) - vllm_config = VllmConfig() - vllm_config.parallel_config.tensor_parallel_size = world_size - vllm_config.parallel_config.enable_expert_parallel = True + parallel_config = ParallelConfig( + tensor_parallel_size=world_size, + enable_expert_parallel=True, + enable_eplb=True, + ) + vllm_config = VllmConfig(parallel_config=parallel_config) with set_current_vllm_config(vllm_config): ensure_model_parallel_initialized( @@ -213,12 +218,19 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig): for lidx in range(test_config.num_layers): 
shuffled_indices[lidx] = torch.randperm(test_config.num_experts) + communicator = create_eplb_communicator( + group_coordinator=get_eplb_group(), + backend=vllm_config.parallel_config.eplb_config.communicator, + expert_weights=rank_expert_weights[0], + ) + rearrange_expert_weights_inplace( indices, shuffled_indices, rank_expert_weights, ep_group, is_profile=False, + communicator=communicator, ) num_local_experts = test_config.num_local_experts diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py index 8035651d5b55..271a6b13e331 100644 --- a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py +++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py @@ -9,12 +9,14 @@ import torch from tests.kernels.moe.utils import make_test_quant_config -from vllm.config import VllmConfig, set_current_vllm_config +from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.eplb_communicator import create_eplb_communicator from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( ensure_model_parallel_initialized, get_dp_group, + get_eplb_group, ) from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.layer import FusedMoE @@ -84,21 +86,22 @@ def make_fused_moe_layer( per_act_token_quant=False, ) - fml.w13_weight.data = w1_q - fml.w2_weight.data = w2_q + re = fml.routed_experts + re.w13_weight.data = w1_q + re.w2_weight.data = w2_q - fml.w2_input_scale.data = torch.randn_like(fml.w2_input_scale.data) / 5 - fml.w13_input_scale.data = torch.randn_like(fml.w13_input_scale.data) / 5 - fml.w2_weight_scale_2.data = torch.randn_like(fml.w2_weight_scale_2.data) / 5 - fml.w13_weight_scale_2.data = torch.randn_like(fml.w13_weight_scale_2.data) / 5 - fml.w2_weight_scale.data = ( - 
torch.randn(fml.w2_weight_scale.data.shape, device=device) / 5 - ).to(fml.w2_weight_scale.data.dtype) - fml.w13_weight_scale.data = ( - torch.randn(fml.w13_weight_scale.data.shape, device=device) / 5 - ).to(fml.w13_weight_scale.data.dtype) + re.w2_input_scale.data = torch.randn_like(re.w2_input_scale.data) / 5 + re.w13_input_scale.data = torch.randn_like(re.w13_input_scale.data) / 5 + re.w2_weight_scale_2.data = torch.randn_like(re.w2_weight_scale_2.data) / 5 + re.w13_weight_scale_2.data = torch.randn_like(re.w13_weight_scale_2.data) / 5 + re.w2_weight_scale.data = ( + torch.randn(re.w2_weight_scale.data.shape, device=device) / 5 + ).to(re.w2_weight_scale.data.dtype) + re.w13_weight_scale.data = ( + torch.randn(re.w13_weight_scale.data.shape, device=device) / 5 + ).to(re.w13_weight_scale.data.dtype) - nvfp4_fused_moe.process_weights_after_loading(fml) + nvfp4_fused_moe.process_weights_after_loading(fml.routed_experts) fml.maybe_init_modular_kernel() @@ -108,9 +111,12 @@ def make_fused_moe_layer( def _test_eplb_fml(env, world_size: int, test_config: TestConfig): set_env_vars_and_device(env) - vllm_config = VllmConfig() - vllm_config.parallel_config.data_parallel_size = world_size - vllm_config.parallel_config.enable_expert_parallel = True + parallel_config = ParallelConfig( + data_parallel_size=world_size, + enable_expert_parallel=True, + enable_eplb=True, + ) + vllm_config = VllmConfig(parallel_config=parallel_config) with set_current_vllm_config(vllm_config): ensure_model_parallel_initialized( @@ -170,12 +176,19 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig): for lidx in range(test_config.num_layers): shuffled_indices[lidx] = torch.randperm(test_config.num_experts) + communicator = create_eplb_communicator( + group_coordinator=get_eplb_group(), + backend=vllm_config.parallel_config.eplb_config.communicator, + expert_weights=rank_expert_weights[0], + ) + rearrange_expert_weights_inplace( indices, shuffled_indices, rank_expert_weights, 
ep_group, is_profile=False, + communicator=communicator, ) num_global_experts = test_config.num_experts diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 11f2c2429bf2..5a0e443b0e5a 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -338,7 +338,8 @@ def FusedMoE( expert_map_manager=expert_map_manager, expert_mapping=expert_mapping, # Extra params that are needed by quant_methods, pass along for now - top_k=top_k, # TODO: can get from moe_config + # Prefer getting these from other sources, e.g. moe_config or + # router object use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, topk_group=topk_group, @@ -349,7 +350,6 @@ def FusedMoE( # TODO get from router? needs to be truncated? e_score_correction_bias=e_score_correction_bias, apply_router_weight_on_input=apply_router_weight_on_input, - activation=moe_activation, **routed_experts_args if routed_experts_args is not None else {}, ) diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 759beea89dd0..8c79e0c928f6 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Iterable +from collections.abc import Callable, Iterable from enum import Enum from typing import TYPE_CHECKING, Any, Literal, cast, overload @@ -61,7 +61,20 @@ def __init__( quant_config: QuantizationConfig | None, expert_map_manager: ExpertMapManager, expert_mapping: list[tuple[str, str, int, str]] | None = None, - **kwargs, + # + # Extra params that are needed by quant_methods, pass along for now + # Prefer getting these from other sources, e.g. 
moe_config or + # router object + # + use_grouped_topk: bool = False, + num_expert_group: int | None = None, + topk_group: int | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + swiglu_limit: float | None = None, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, ): super().__init__() self.layer_name = layer_name @@ -78,12 +91,20 @@ def __init__( self.rocm_aiter_fmoe_enabled = moe_config.rocm_aiter_fmoe_enabled - # Set any remaining kwargs as attributes. This is necessary because - # it is not simple to track all the layer attributes queried by - # quantization methods and various utilities. - # It would be good to eventually codify these in the FusedMoEConfig + # It would be good to eventually codify these in FusedMoEConfig # or some other config. - self.__dict__.update(kwargs) + self.top_k = self.moe_config.experts_per_token + self.activation = self.moe_config.activation + self.use_grouped_topk = use_grouped_topk + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.custom_routing_function = custom_routing_function + self.scoring_func = scoring_func + self.routed_scaling_factor = routed_scaling_factor + self.swiglu_limit = swiglu_limit + self.e_score_correction_bias = e_score_correction_bias + self.apply_router_weight_on_input = apply_router_weight_on_input + # End random parameters self.quant_method = self._get_quant_method( self.layer_name, From 6c4e0a6048d7c554779f192e18390fb37d14de11 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 30 May 2026 00:33:10 +0000 Subject: [PATCH 182/191] cleaner truncation handling Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/moe_runner.py | 37 ++++++++++++------- .../layers/quantization/utils/marlin_utils.py | 7 +++- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py 
b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 3efdda397746..24ab7d8c4bf3 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -423,6 +423,7 @@ def _maybe_reduce_shared_expert_output( def _maybe_reduce_final_output( self, states: torch.Tensor, + trunc_size: int | None, ) -> torch.Tensor: """All-reduce the combined output if needed. @@ -441,7 +442,7 @@ def _maybe_reduce_final_output( ): states = tensor_model_parallel_all_reduce(states) - return states + return states[..., :trunc_size] if trunc_size is not None else states def _encode_layer_name(self) -> str | LayerName: if _USE_LAYERNAME: @@ -458,7 +459,7 @@ def _maybe_pad_hidden_states( self, shared_experts_input: torch.Tensor | None, hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, int]: + ) -> tuple[torch.Tensor, int | None, int | None]: """Pad hidden_states to moe_config.hidden_dim and compute the original dimension for later truncation. 
@@ -470,11 +471,12 @@ def _maybe_pad_hidden_states( shared_experts_hidden_dim = ( shared_experts_input.shape[-1] if shared_experts_input is not None else 0 ) - transformed_hidden_dim = hidden_states.shape[-1] + transformed_hidden_dim: int | None = hidden_states.shape[-1] if ( not self._quant_method.skip_forward_padding and self.moe_config.hidden_dim != transformed_hidden_dim ): + assert transformed_hidden_dim is not None hidden_states = F.pad( hidden_states, (0, self.moe_config.hidden_dim - transformed_hidden_dim), @@ -482,12 +484,17 @@ def _maybe_pad_hidden_states( value=0.0, ) + if transformed_hidden_dim == hidden_states.shape[-1]: + transformed_hidden_dim = None + if self.routed_output_transform is not None and shared_experts_hidden_dim > 0: - orig_hidden_dims = shared_experts_hidden_dim + pre_xform_trunc_size = transformed_hidden_dim + post_xform_trunc_size = shared_experts_hidden_dim else: - orig_hidden_dims = transformed_hidden_dim + pre_xform_trunc_size = None + post_xform_trunc_size = transformed_hidden_dim - return hidden_states, orig_hidden_dims + return hidden_states, pre_xform_trunc_size, post_xform_trunc_size def _maybe_apply_shared_experts( self, @@ -629,9 +636,11 @@ def forward( # so routed output can be trimmed before # shared+routed add / latent up proj if needed. - hidden_states, og_hidden_dim = self._maybe_pad_hidden_states( - shared_experts_input, - hidden_states, + hidden_states, og_hidden_dim_pre_xform, og_hidden_dim_post_xform = ( + self._maybe_pad_hidden_states( + shared_experts_input, + hidden_states, + ) ) result = self._forward_entry( @@ -657,6 +666,9 @@ def forward( # Extract outputs from result shared_output, fused_output = _unpack(result) + if og_hidden_dim_pre_xform is not None: + fused_output = fused_output[..., :og_hidden_dim_pre_xform] + # If combine kernel already reduced fused, reduce shared to match. # See note above re: the two all-reduce points. 
shared_output = self._maybe_reduce_shared_expert_output(shared_output) @@ -669,14 +681,11 @@ def forward( fused_output = self.apply_routed_output_transform(fused_output) if shared_output is not None: - result = shared_output + fused_output[:, :og_hidden_dim] + result = shared_output + fused_output else: result = fused_output - result = self._maybe_reduce_final_output(result) - - if shared_output is None: - result = result[..., :og_hidden_dim] + result = self._maybe_reduce_final_output(result, og_hidden_dim_post_xform) return self._maybe_add_zero_expert_output(result) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index eca04eed74b6..8c66b39b4923 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -234,7 +234,12 @@ def check_moe_marlin_supports_layer(layer: RoutedExperts, group_size: int) -> bo if current_platform.is_rocm(): return False hidden_size = layer.hidden_size - intermediate_size_per_partition = layer.intermediate_size_per_partition + # Note: The layer has not performed rounding on intermediate_size's at this + # point. Use the unpadded size which won't change. 
+ intermediate_size_per_partition = ( + layer.moe_config.intermediate_size_per_partition_unpadded + ) + assert intermediate_size_per_partition is not None # apply_router_weight_on_input is not supported for moe marlin supports_router_weight = not layer.apply_router_weight_on_input From 9d98e3f24b14d172cc9890e236f9b6aa3a185729 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 30 May 2026 00:37:20 +0000 Subject: [PATCH 183/191] add comment Signed-off-by: Bill Nell --- .../layers/fused_moe/runner/moe_runner.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 24ab7d8c4bf3..ba41ae82a4da 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -484,6 +484,22 @@ def _maybe_pad_hidden_states( value=0.0, ) + # Truncation sizes for stripping kernel padding from the output. + # None means no truncation needed (no padding was applied). 
+ # + # Two truncation points exist in forward(): + # pre_xform: applied to fused_output BEFORE routed_output_transform + # post_xform: applied to the final result AFTER all-reduce + # + # Latent MoE with shared experts (NemotronH): + # - pre_xform strips padding from the latent dim so + # routed_output_transform receives the correct input size + # - post_xform truncates to shared_experts_hidden_dim (full hidden) + # after shared + routed outputs are combined and all-reduced + # + # Standard MoE / MoE without transforms (GPT-OSS, Mixtral): + # - pre_xform is None (no early truncation) + # - post_xform strips padding after all-reduce (or None if unpadded) if transformed_hidden_dim == hidden_states.shape[-1]: transformed_hidden_dim = None From 42a991aa1ffa5be21f5807e10a6c8e83b68b72ba Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 1 Jun 2026 21:30:32 +0000 Subject: [PATCH 184/191] fix quantization tests failures Signed-off-by: Bill Nell --- tests/quantization/test_compressed_tensors.py | 20 +++++++++++-------- tests/quantization/test_fp8.py | 1 + tests/quantization/test_online.py | 2 +- tests/quantization/test_quark.py | 4 ++-- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 2165361da678..ac9f7b71e492 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -525,20 +525,22 @@ def test_compressed_tensors_moe_ignore_with_model(vllm_runner): with vllm_runner(model_path, enforce_eager=True) as llm: def check_model(model): - from vllm.model_executor.layers.fused_moe import FusedMoE + from vllm.model_executor.layers.fused_moe import MoERunner from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( # noqa: E501 CompressedTensorsMoEMethod, ) # Check layer 0 MoE (should be quantized) layer_quantized = model.model.layers[0].mlp.experts - assert 
isinstance(layer_quantized, FusedMoE) - assert isinstance(layer_quantized.quant_method, CompressedTensorsMoEMethod) + assert isinstance(layer_quantized, MoERunner) + assert isinstance(layer_quantized._quant_method, CompressedTensorsMoEMethod) # Check layer 10 MoE (should be unquantized + ignored) layer_unquantized = model.model.layers[3].mlp.experts - assert isinstance(layer_unquantized, FusedMoE) - assert isinstance(layer_unquantized.quant_method, UnquantizedFusedMoEMethod) + assert isinstance(layer_unquantized, MoERunner) + assert isinstance( + layer_unquantized._quant_method, UnquantizedFusedMoEMethod + ) llm.apply_model(check_model) @@ -670,7 +672,7 @@ def test_compressed_tensors_mxfp8_moe_setup(vllm_runner): ) as llm: def check_model(model): - from vllm.model_executor.layers.fused_moe import FusedMoE + from vllm.model_executor.layers.fused_moe import MoERunner from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.compressed_tensors_moe_w8a8_mxfp8 import ( # noqa: E501 CompressedTensorsW8A8Mxfp8MoEMethod, ) @@ -682,8 +684,10 @@ def check_model(model): assert isinstance(qkv.scheme, CompressedTensorsW8A8Mxfp8) experts = layer.mlp.experts - assert isinstance(experts, FusedMoE) - assert isinstance(experts.quant_method, CompressedTensorsW8A8Mxfp8MoEMethod) + assert isinstance(experts, MoERunner) + assert isinstance( + experts._quant_method, CompressedTensorsW8A8Mxfp8MoEMethod + ) llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=4) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index b93d34afbb9b..7c92c3dc15e2 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -440,6 +440,7 @@ def test_fp8_reloading( hidden_size=1, intermediate_size=1, ) + layer = layer.routed_experts method = method_cls(config, layer) method.create_weights( layer=layer, diff --git a/tests/quantization/test_online.py b/tests/quantization/test_online.py index 
0254da79e101..995df7946008 100644 --- a/tests/quantization/test_online.py +++ b/tests/quantization/test_online.py @@ -115,7 +115,7 @@ def check_model(model): # because of how we craft the test case inputs assert isinstance(o_proj.quant_method, expected_linear_cls) if moe is not None: - assert isinstance(moe.quant_method, expected_moe_cls) + assert isinstance(moe._quant_method, expected_moe_cls) if current_platform.is_cuda(): assert o_proj.weight.dtype == torch.float8_e4m3fn diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index fe474d7e0cc8..56922331092c 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -146,8 +146,8 @@ def check_model(model): layer = model.model.layers[0] # MoE experts should use QuarkW8A8Int8MoEMethod moe = layer.mlp.experts - assert isinstance(moe.quant_method, QuarkW8A8Int8MoEMethod), ( - f"Expected QuarkW8A8Int8MoEMethod, got {type(moe.quant_method)}" + assert isinstance(moe._quant_method, QuarkW8A8Int8MoEMethod), ( + f"Expected QuarkW8A8Int8MoEMethod, got {type(moe._quant_method)}" ) # Non-MoE linear layers should use QuarkW8A8Int8 qkv_proj = layer.self_attn.qkv_proj From aa5f5a326a76e4b0647be46b4a1b8f9093a8b18d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 1 Jun 2026 21:40:55 +0000 Subject: [PATCH 185/191] fix weight loader test Signed-off-by: Bill Nell --- .../moe/test_moe_weight_loading_padded.py | 74 +++++++++---------- .../layers/fused_moe/routed_experts.py | 43 +++++++---- 2 files changed, 65 insertions(+), 52 deletions(-) diff --git a/tests/kernels/moe/test_moe_weight_loading_padded.py b/tests/kernels/moe/test_moe_weight_loading_padded.py index abe473879f1d..d4939c79e5a7 100644 --- a/tests/kernels/moe/test_moe_weight_loading_padded.py +++ b/tests/kernels/moe/test_moe_weight_loading_padded.py @@ -12,7 +12,7 @@ import pytest import torch -from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.routed_experts 
import RoutedExperts class TestGetHiddenDim: @@ -20,45 +20,45 @@ class TestGetHiddenDim: def test_2d_non_transposed_w2(self): # w2: shard_dim=1 (intermediate), hidden=0 - assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=2) == 0 + assert RoutedExperts._get_hidden_dim(shard_dim=1, ndim=2) == 0 def test_2d_non_transposed_w13(self): # w1/w3: shard_dim=0 (intermediate), hidden=1 - assert FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) == 1 + assert RoutedExperts._get_hidden_dim(shard_dim=0, ndim=2) == 1 def test_2d_transposed_w2(self): # transposed w2: shard_dim=0, hidden=1 - assert FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) == 1 + assert RoutedExperts._get_hidden_dim(shard_dim=0, ndim=2) == 1 def test_2d_transposed_w13(self): # transposed w1/w3: shard_dim=1, hidden=0 - assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=2) == 0 + assert RoutedExperts._get_hidden_dim(shard_dim=1, ndim=2) == 0 def test_3d_non_transposed_w2(self): # 3D w2: shard_dim=2, hidden=1 - assert FusedMoE._get_hidden_dim(shard_dim=2, ndim=3) == 1 + assert RoutedExperts._get_hidden_dim(shard_dim=2, ndim=3) == 1 def test_3d_non_transposed_w13(self): # 3D w1/w3: shard_dim=1, hidden=2 - assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=3) == 2 + assert RoutedExperts._get_hidden_dim(shard_dim=1, ndim=3) == 2 def test_3d_transposed_w2(self): # transposed 3D w2: shard_dim=1, hidden=2 - assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=3) == 2 + assert RoutedExperts._get_hidden_dim(shard_dim=1, ndim=3) == 2 def test_3d_transposed_w13(self): # transposed 3D w1/w3: shard_dim=2, hidden=1 - assert FusedMoE._get_hidden_dim(shard_dim=2, ndim=3) == 1 + assert RoutedExperts._get_hidden_dim(shard_dim=2, ndim=3) == 1 def test_1d_returns_zero(self): # 1D per-channel scales: always returns 0 - assert FusedMoE._get_hidden_dim(shard_dim=0, ndim=1) == 0 - assert FusedMoE._get_hidden_dim(shard_dim=1, ndim=1) == 0 + assert RoutedExperts._get_hidden_dim(shard_dim=0, ndim=1) == 0 + assert 
RoutedExperts._get_hidden_dim(shard_dim=1, ndim=1) == 0 def test_invalid_shard_dim_raises(self): # shard_dim outside the data dimensions should raise with pytest.raises(ValueError, match="not a valid data dimension"): - FusedMoE._get_hidden_dim(shard_dim=0, ndim=3) + RoutedExperts._get_hidden_dim(shard_dim=0, ndim=3) class TestNarrowExpertDataForPadding: @@ -67,7 +67,7 @@ class TestNarrowExpertDataForPadding: def test_no_narrowing_when_shapes_match(self): expert_data = torch.zeros(1024, 1024) loaded_weight = torch.randn(1024, 1024) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) assert result.shape == loaded_weight.shape @@ -77,7 +77,7 @@ def test_narrow_w2_hidden_dim(self): # w2: (hidden_size, intermediate_size) - hidden_size padded at dim 0 expert_data = torch.zeros(3072, 1024) loaded_weight = torch.randn(2688, 1024) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) assert result.shape == (2688, 1024) @@ -86,7 +86,7 @@ def test_narrow_w13_hidden_dim(self): # w1/w3: (intermediate_size, hidden_size) - hidden_size padded at dim 1 expert_data = torch.zeros(2048, 3072) loaded_weight = torch.randn(2048, 2688) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=1 ) assert result.shape == (2048, 2688) @@ -95,8 +95,8 @@ def test_narrow_transposed_w2(self): # transposed w2: (intermediate_size, hidden_size) - hidden at dim 1 expert_data = torch.zeros(1024, 3072) loaded_weight = torch.randn(1024, 2688) - hidden_dim = FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) - result = FusedMoE._narrow_expert_data_for_padding( + hidden_dim = RoutedExperts._get_hidden_dim(shard_dim=0, ndim=2) + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, 
hidden_dim=hidden_dim ) assert result.shape == (1024, 2688) @@ -105,7 +105,7 @@ def test_narrow_3d_full_load(self): # 3D tensor for full_load path: w2 (num_experts, hidden_size, intermediate) expert_data = torch.zeros(8, 3072, 1024) loaded_weight = torch.randn(8, 2688, 1024) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=1 ) assert result.shape == (8, 2688, 1024) @@ -114,7 +114,7 @@ def test_narrow_1d_scale(self): # 1D scale tensor: per-channel w2 scale (hidden_size,) expert_data = torch.zeros(3072) loaded_weight = torch.randn(2688) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) assert result.shape == (2688,) @@ -123,7 +123,7 @@ def test_scalar_weight_no_op(self): # 0-dim tensor should be a no-op expert_data = torch.zeros(3072) loaded_weight = torch.tensor(1.0) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) # ndim == 0, so no narrowing @@ -133,7 +133,7 @@ def test_no_narrowing_when_loaded_weight_larger(self): # Guard: don't narrow if loaded_weight is larger than expert_data expert_data = torch.zeros(2688, 1024) loaded_weight = torch.randn(3072, 1024) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) assert result.shape == (2688, 1024) @@ -143,7 +143,7 @@ def test_negative_hidden_dim_is_noop(self): # Negative hidden_dim should be a safe no-op (0 <= check) expert_data = torch.zeros(3072, 1024) loaded_weight = torch.randn(2688, 1024) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=-1 ) # -1 fails the 0 <= check, so no narrowing @@ -155,7 +155,7 @@ def 
test_only_narrows_hidden_dim(self): # even when other dimensions also differ expert_data = torch.zeros(3072, 2048) loaded_weight = torch.randn(2688, 1024) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) # Only dim 0 (hidden) should be narrowed; dim 1 stays at 2048 @@ -165,7 +165,7 @@ def test_narrowed_data_shares_storage(self): # Verify narrowing returns a view (writes go to original tensor) expert_data = torch.zeros(3072, 1024) loaded_weight = torch.randn(2688, 1024) - result = FusedMoE._narrow_expert_data_for_padding( + result = RoutedExperts._narrow_expert_data_for_padding( expert_data, loaded_weight, hidden_dim=0 ) result.copy_(loaded_weight) @@ -188,8 +188,8 @@ def test_load_w2_with_padding(self): loaded_weight = torch.randn(original_hidden, intermediate) # w2 non-transposed: shard_dim=1, hidden_dim=0 - hidden_dim = FusedMoE._get_hidden_dim(shard_dim=1, ndim=2) - expert_data = FusedMoE._narrow_expert_data_for_padding( + hidden_dim = RoutedExperts._get_hidden_dim(shard_dim=1, ndim=2) + expert_data = RoutedExperts._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim ) expert_data.copy_(loaded_weight) @@ -211,8 +211,8 @@ def test_load_w13_with_padding(self): loaded_weight = torch.randn(intermediate, original_hidden) # w1 non-transposed: shard_dim=0, hidden_dim=1 - hidden_dim = FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) - expert_data = FusedMoE._narrow_expert_data_for_padding( + hidden_dim = RoutedExperts._get_hidden_dim(shard_dim=0, ndim=2) + expert_data = RoutedExperts._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim ) expert_data.copy_(loaded_weight) @@ -233,8 +233,8 @@ def test_load_transposed_w2_with_padding(self): expert_data_full = torch.zeros(intermediate, padded_hidden) loaded_weight = torch.randn(intermediate, original_hidden) - hidden_dim = 
FusedMoE._get_hidden_dim(shard_dim=0, ndim=2) - expert_data = FusedMoE._narrow_expert_data_for_padding( + hidden_dim = RoutedExperts._get_hidden_dim(shard_dim=0, ndim=2) + expert_data = RoutedExperts._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim ) expert_data.copy_(loaded_weight) @@ -249,8 +249,8 @@ def test_no_padding_is_noop(self): expert_data_full = torch.zeros(hidden, intermediate) loaded_weight = torch.randn(hidden, intermediate) - hidden_dim = FusedMoE._get_hidden_dim(shard_dim=1, ndim=2) - expert_data = FusedMoE._narrow_expert_data_for_padding( + hidden_dim = RoutedExperts._get_hidden_dim(shard_dim=1, ndim=2) + expert_data = RoutedExperts._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim ) expert_data.copy_(loaded_weight) @@ -270,8 +270,8 @@ def test_narrow_shard_dim(self): loaded_weight = torch.randn(original_hidden, original_intermediate) shard_dim = 1 - hidden_dim = FusedMoE._get_hidden_dim(shard_dim=shard_dim, ndim=2) - expert_data = FusedMoE._narrow_expert_data_for_padding( + hidden_dim = RoutedExperts._get_hidden_dim(shard_dim=shard_dim, ndim=2) + expert_data = RoutedExperts._narrow_expert_data_for_padding( expert_data_full, loaded_weight, hidden_dim=hidden_dim, @@ -307,8 +307,8 @@ def test_bnb_shape_mismatch_raises(self): loaded_weight = torch.randint(0, 255, (original_packed, 1), dtype=torch.uint8) - # Minimal FusedMoE mock so weight_loader reaches the BnB path. - moe = MagicMock(spec=FusedMoE) + # Minimal RoutedExperts mock so weight_loader reaches the BnB path. + moe = MagicMock(spec=RoutedExperts) moe.quant_config = None moe.quant_method = MagicMock() moe.quant_method.__class__.__name__ = "BitsAndBytesMethod" @@ -317,7 +317,7 @@ def test_bnb_shape_mismatch_raises(self): # Call the real weight_loader (unbound) with our mock as self. 
with pytest.raises(ValueError, match="BitsAndBytes"): - FusedMoE.weight_loader( + RoutedExperts.weight_loader( moe, param, loaded_weight, diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index 8c79e0c928f6..ba95e70fee8e 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -360,7 +360,10 @@ def _load_per_channel_weight_scale( if shard_id == "w2": hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim) expert_data = self._narrow_expert_data_for_padding( - expert_data, loaded_weight, hidden_dim=hidden_dim + expert_data, + loaded_weight, + hidden_dim=hidden_dim, + shard_dim=shard_dim, ) expert_data.copy_(loaded_weight) elif shard_id in ("w1", "w3"): @@ -400,29 +403,33 @@ def _narrow_expert_data_for_padding( expert_data: torch.Tensor, loaded_weight: torch.Tensor, hidden_dim: int, + shard_dim: int | None = None, ) -> torch.Tensor: - """Narrow expert_data hidden dim to match loaded_weight for padded - hidden_size. + """Narrow expert_data to match loaded_weight for padded dimensions. When backends (e.g., DeepEP) round up hidden_size, weight parameters are larger than checkpoint weights. Narrow the padded hidden dimension - before copying. + before copying. Similarly, when padding occurs on the shard + (intermediate) dimension (e.g. for MXFP4 GEMM), narrow that dimension + as well. Args: expert_data: The (possibly padded) parameter tensor to narrow. loaded_weight: The checkpoint weight tensor with original size. hidden_dim: The dimension index corresponding to hidden_size. Must be non-negative. + shard_dim: The dimension index corresponding to the shard + (intermediate) dimension. Defaults to `None`. 
""" - if ( - loaded_weight.ndim > 0 - and 0 <= hidden_dim < expert_data.ndim - and hidden_dim < loaded_weight.ndim - and expert_data.shape[hidden_dim] > loaded_weight.shape[hidden_dim] - ): - expert_data = expert_data.narrow( - hidden_dim, 0, loaded_weight.shape[hidden_dim] - ) + dims = (hidden_dim,) if shard_dim is None else (hidden_dim, shard_dim) + if loaded_weight.ndim > 0: + for dim in dims: + if ( + 0 <= dim < expert_data.ndim + and dim < loaded_weight.ndim + and expert_data.shape[dim] > loaded_weight.shape[dim] + ): + expert_data = expert_data.narrow(dim, 0, loaded_weight.shape[dim]) return expert_data def _load_w13( @@ -469,7 +476,10 @@ def _load_w13( expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim) expert_data = self._narrow_expert_data_for_padding( - expert_data, loaded_weight, hidden_dim=hidden_dim + expert_data, + loaded_weight, + hidden_dim=hidden_dim, + shard_dim=shard_dim, ) expert_data.copy_(loaded_weight) @@ -501,7 +511,10 @@ def _load_w2( # w2, down_proj: Load into only logical weight of w2. 
hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim) expert_data = self._narrow_expert_data_for_padding( - expert_data, loaded_weight, hidden_dim=hidden_dim + expert_data, + loaded_weight, + hidden_dim=hidden_dim, + shard_dim=shard_dim, ) expert_data.copy_(loaded_weight) From 40d1d6e5af3dcd9240468345e55ca20526504f66 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 1 Jun 2026 23:09:28 +0000 Subject: [PATCH 186/191] maybe fix eplb test Signed-off-by: Bill Nell --- vllm/distributed/eplb/eplb_state.py | 3 ++- vllm/model_executor/layers/fused_moe/runner/moe_runner.py | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 319a5f22c922..10c4d5fbbfe0 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -652,7 +652,8 @@ def _init_should_record_tensor(self, model: "MixtureOfExperts") -> None: # type ) for ls in layer_states: - ls.should_record_tensor = self.should_record_tensor + if ls is not None: + ls.should_record_tensor = self.should_record_tensor def rearrange( self, diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index ba41ae82a4da..172e197e7d01 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -14,6 +14,7 @@ get_pcp_group, tensor_model_parallel_all_reduce, ) +from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.forward_context import ( ForwardContext, get_forward_context, @@ -942,6 +943,10 @@ def get_expert_weights(self) -> Iterable[torch.Tensor]: # EPLB # + @property + def eplb_state(self) -> EplbLayerState | None: + return self.router.eplb_state + def set_eplb_state( self, moe_layer_idx: int, From f7fd06694a88472e2b7acea514ed2f7ace51d859 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 2 Jun 2026 02:38:18 +0000 Subject: [PATCH 
187/191] fixes Signed-off-by: Bill Nell --- vllm/distributed/elastic_ep/elastic_execute.py | 4 ++-- vllm/model_executor/layers/fused_moe/layer.py | 1 + vllm/model_executor/layers/fused_moe/routed_experts.py | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py index 2cd6decb3a58..bc0c3f668057 100644 --- a/vllm/distributed/elastic_ep/elastic_execute.py +++ b/vllm/distributed/elastic_ep/elastic_execute.py @@ -466,8 +466,8 @@ def switch_and_prepare(self) -> None: self._commit_staged_moe_quant_methods() # Legacy modular methods need to be recreated for the new EP size. for module in moe_modules: - if getattr(module.quant_method, "wraps_legacy_quant_method", False): - module._replace_quant_method(module.quant_method.old_quant_method) + if getattr(module._quant_method, "wraps_legacy_quant_method", False): + module._replace_quant_method(module._quant_method.old_quant_method) prepare_communication_buffer_for_model(self.worker.model_runner.model) eplb_model_state.communicator = create_eplb_communicator( diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5a0e443b0e5a..e99734a62d36 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -340,6 +340,7 @@ def FusedMoE( # Extra params that are needed by quant_methods, pass along for now # Prefer getting these from other sources, e.g. 
moe_config or # router object + renormalize=renormalize, use_grouped_topk=use_grouped_topk, num_expert_group=num_expert_group, topk_group=topk_group, diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index ba95e70fee8e..afb526746a21 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -66,6 +66,7 @@ def __init__( # Prefer getting these from other sources, e.g. moe_config or # router object # + renormalize: bool = True, use_grouped_topk: bool = False, num_expert_group: int | None = None, topk_group: int | None = None, @@ -95,6 +96,7 @@ def __init__( # or some other config. self.top_k = self.moe_config.experts_per_token self.activation = self.moe_config.activation + self.renormalize = renormalize self.use_grouped_topk = use_grouped_topk self.num_expert_group = num_expert_group self.topk_group = topk_group From a1df5b0221e5a5b3d729e2ae035a4821d441e1c9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 2 Jun 2026 02:49:48 +0000 Subject: [PATCH 188/191] fix transformers test Signed-off-by: Bill Nell --- .../model_executor/models/transformers/moe.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 362c997cb327..4eddf3a78a7f 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -58,8 +58,8 @@ class TransformersFusedMoE(MoERunner): # --8<-- [end:transformers_fused_moe] def __init__(self, *args, moe_state: TransformersMoEState, **kwargs): super().__init__(*args, **kwargs) - self.moe_state = moe_state - self.moe_state.is_sequence_parallel = self.moe_config.is_sequence_parallel + self._moe_state = moe_state + self._moe_state.is_sequence_parallel = self.moe_config.is_sequence_parallel def forward( self, @@ -71,14 +71,19 @@ 
def forward( """In Transformers `experts.forward` will have this signature. We discard any extra kwargs because we cannot use them here.""" + return torch.ops.vllm.transformers_moe_forward( + hidden_states, + topk_ids.to(torch.int32), + topk_weights.to(torch.float32), + self.layer_name, + ) - self.moe_state.topk_ids = topk_ids.to(torch.int32) - topk_weights = topk_weights.to(torch.float32) - - # Clone hidden_states because it will be mutated in-place in FusedMoE - # TODO(bnell): figure out a way to avoid calling runner directly. - # it is a hack that the weight are being passed via logits. - return super().forward(hidden_states.clone(), topk_weights) + def _forward_super( + self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + ) -> torch.Tensor: + return super().forward(hidden_states, topk_weights) def load_weights( self, weights: Iterable[tuple[str, torch.Tensor]] @@ -86,7 +91,6 @@ def load_weights( return self.routed_experts.load_weights(weights) -# TODO(bnell): Is this still needed? Probably broken if it is. def transformers_moe_forward( hidden_states: torch.Tensor, topk_ids: torch.Tensor, @@ -96,11 +100,8 @@ def transformers_moe_forward( """Store the `topk_ids` in the layer and call the actual forward.""" forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] - self._topk_ids = topk_ids - # Clone hidden_states because it will be mutated in-place in FusedMoE - # TODO(bnell): figure out a way to avoid calling runner directly. - # it is a hack that the weight are being passed via logits. 
- return self.runner.forward(hidden_states.clone(), topk_weights) + self._moe_state.topk_ids = topk_ids + return self._forward_super(hidden_states, topk_weights) def transformers_moe_forward_fake( @@ -303,6 +304,7 @@ def custom_routing_function( `topk_ids` we stored in the layer earlier.""" topk_weights = gating_output topk_ids = moe_state.topk_ids + assert topk_ids is not None # Handle all gather in expert parallel if topk_ids.size(0) != hidden_states.size(0): dp_metadata = get_forward_context().dp_metadata From ad9a87e03f909d2cec32a12fa15010e1f20e8804 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 2 Jun 2026 03:12:11 +0000 Subject: [PATCH 189/191] fix params_dtype + add comment Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 8 ++++--- .../layers/fused_moe/routed_experts.py | 21 +++---------------- .../model_executor/models/transformers/moe.py | 2 ++ 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index e99734a62d36..db4cf69930f6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -291,17 +291,18 @@ def FusedMoE( hash_indices_table=hash_indices_table, ) + if params_dtype is None: + params_dtype = torch.get_default_dtype() + # FIXME (varun): We should have a better way of inferring the activation # datatype. This works for now as the tensor datatype entering the MoE # operation is typically unquantized (i.e. float16/bfloat16). if vllm_config.model_config is not None: moe_in_dtype = vllm_config.model_config.dtype - elif params_dtype is not None: + else: # TODO (bnell): This is a hack to get test_mixtral_moe to work # since model_config is not set in the pytest test. 
moe_in_dtype = params_dtype - else: - moe_in_dtype = torch.get_default_dtype() moe_config = FusedMoEConfig( num_experts=global_num_experts, @@ -330,6 +331,7 @@ def FusedMoE( if routed_experts_cls is None: routed_experts_cls = RoutedExperts + assert params_dtype is not None routed_experts = routed_experts_cls( layer_name, params_dtype, diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index afb526746a21..b61b45d5f80b 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -8,7 +8,6 @@ import torch from torch.nn.parameter import UninitializedParameter -from vllm.config import get_current_vllm_config from vllm.distributed.eplb.eplb_state import EplbState from vllm.logger import init_logger from vllm.model_executor.custom_op import PluggableLayer @@ -56,7 +55,7 @@ class RoutedExperts(PluggableLayer): def __init__( self, layer_name: str, - params_dtype: torch.dtype | None, + params_dtype: torch.dtype, moe_config: FusedMoEConfig, quant_config: QuantizationConfig | None, expert_map_manager: ExpertMapManager, @@ -86,6 +85,7 @@ def __init__( self.hidden_size = moe_config.hidden_dim self.global_num_experts = moe_config.num_experts self.local_num_experts = moe_config.num_local_experts + self.params_dtype = params_dtype # Register buffers for state_dict compatibility self.update_expert_map_info() @@ -116,26 +116,11 @@ def __init__( # Round up hidden size and update moe_config. # TODO: move roundup to _get_quant_method? - # FIXME (varun): We should have a better way of inferring the activation - # datatype. This works for now as the tensor datatype entering the MoE - # operation is typically unquantized (i.e. float16/bfloat16). 
- vllm_config = get_current_vllm_config() - - if vllm_config.model_config is not None: - moe_in_dtype = vllm_config.model_config.dtype - elif params_dtype is not None: - # TODO (bnell): This is a hack to get test_mixtral_moe to work - # since model_config is not set in the pytest test. - moe_in_dtype = params_dtype - else: - params_dtype = torch.get_default_dtype() - moe_in_dtype = params_dtype - self.hidden_size, self.intermediate_size_per_partition = ( self.quant_method.maybe_roundup_sizes( self.hidden_size, self.moe_config.intermediate_size_per_partition, - moe_in_dtype, + self.moe_config.in_dtype, self.moe_config.moe_parallel_config, ) ) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 4eddf3a78a7f..60e39b330f05 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -71,6 +71,8 @@ def forward( """In Transformers `experts.forward` will have this signature. We discard any extra kwargs because we cannot use them here.""" + # Note: we need to forward through a custom op so the topk_ids + # can be transferred without interfering with cudagraphs. 
return torch.ops.vllm.transformers_moe_forward( hidden_states, topk_ids.to(torch.int32), From 8c7976dbaee170ca80eeee2276b573aade0d5c5b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 2 Jun 2026 19:57:14 +0000 Subject: [PATCH 190/191] fix elastic_ep Signed-off-by: Bill Nell --- vllm/distributed/elastic_ep/elastic_execute.py | 7 ++----- .../layers/fused_moe/routed_experts.py | 14 +++++++++++--- .../layers/fused_moe/runner/moe_runner.py | 9 ++++++++- .../layers/fused_moe/runner/shared_experts.py | 4 ++++ 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py index bc0c3f668057..56ddbe01e160 100644 --- a/vllm/distributed/elastic_ep/elastic_execute.py +++ b/vllm/distributed/elastic_ep/elastic_execute.py @@ -76,7 +76,7 @@ def batch_transfer_weights( all_params = [] for name, param in state_dict.items(): - if name.endswith("expert_map"): + if name.endswith("expert_map") or name.find("._shared_experts") != -1: continue if param.data_ptr() not in expert_weights_set: all_params.append(param.data) @@ -396,10 +396,7 @@ def switch_and_prepare(self) -> None: ep_group = get_ep_group() for module in moe_modules: new_moe_config = self._make_eep_moe_config(module, dp_group, ep_group) - module.moe_config.num_experts = new_moe_config.num_experts - module.global_num_experts = module.moe_config.num_experts - module.moe_parallel_config = new_moe_config.moe_parallel_config - module.moe_config.moe_parallel_config = module.moe_parallel_config + module._set_moe_config(new_moe_config) # Update EPLB state eplb_state = self.worker.model_runner.eplb_state diff --git a/vllm/model_executor/layers/fused_moe/routed_experts.py b/vllm/model_executor/layers/fused_moe/routed_experts.py index b61b45d5f80b..6931fecafce2 100644 --- a/vllm/model_executor/layers/fused_moe/routed_experts.py +++ b/vllm/model_executor/layers/fused_moe/routed_experts.py @@ -164,6 +164,16 @@ def __init__( 
self.quant_method.create_weights(layer=self, **moe_quant_params) + # TODO(bnell): Temporary hack. Get rid of this. + def _replace_quant_method(self, quant_method: FusedMoEMethodBase): + self.quant_method = quant_method + + # TODO(bnell): Hack for elastic_ep. Get rid of this + def _set_moe_config(self, new_moe_config: FusedMoEConfig): + self.moe_config = new_moe_config + self.global_num_experts = new_moe_config.num_experts + # local experts? + def _get_quant_method( self, prefix: str, @@ -205,9 +215,7 @@ def use_ep(self) -> bool: @property def expert_map(self) -> torch.Tensor | None: return ( - self.expert_map_manager.expert_map - if not self.rocm_aiter_fmoe_enabled - else self.expert_map_manager.expert_mask + self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask ) def update_expert_map_info(self): diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py index 172e197e7d01..cf8bd73fbf6c 100644 --- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -315,7 +315,14 @@ def is_internal_router(self) -> bool: # TODO(bnell): Temporary hack. Get rid of this. def _replace_quant_method(self, quant_method: FusedMoEMethodBase): - self.routed_experts.quant_method = quant_method + self.routed_experts._replace_quant_method(quant_method) + + # TODO(bnell): Hack for elastic_ep. Get rid of this + def _set_moe_config(self, new_moe_config: FusedMoEConfig): + self.moe_config = new_moe_config + self.routed_experts._set_moe_config(new_moe_config) + if self._shared_experts is not None: + self._shared_experts._set_moe_config(new_moe_config) def _maybe_fuse_gate_weights(self): """Fuse router and shared expert gate weights on first call. 
diff --git a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py index d44d60b19be5..dc4c99ea3e57 100644 --- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py +++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py @@ -71,6 +71,10 @@ def __init__( if self._stream is not None: logger.debug_once("Enabled separate cuda stream for MoE shared_experts") + # TODO(bnell): Hack for elastic_ep. Get rid of this + def _set_moe_config(self, new_moe_config: FusedMoEConfig): + self.moe_config = new_moe_config + @property def _disable_shared_experts_overlap(self) -> bool: # Disable shared expert overlap if: From 83af31b8f280f017e8d6686a3a0dd2fe93e9cb5f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 2 Jun 2026 20:31:14 +0000 Subject: [PATCH 191/191] fix ernie path Signed-off-by: Bill Nell --- vllm/model_executor/models/ernie45_vl_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 66b135cc30a6..0bdb567c0aaf 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -708,7 +708,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: delta = moe_offset - vision_expert_start_idx name = name.replace( f".experts.{moe_offset}", - f".vision_experts.{routed_experts}{delta}", + f".vision_experts{routed_experts}.{delta}", ) for mapping in expert_params_mapping: