From ced1799dd77fad9fd2392c5dd303a707eac94aed Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 20:50:13 +0000 Subject: [PATCH 01/21] expert map manager Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 514 ++++++++++++++++++ 1 file changed, 514 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/expert_map_manager.py diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py new file mode 100644 index 000000000000..5364f4163102 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -0,0 +1,514 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Expert Map Manager for MoE layers. + +This module contains the ExpertMapManager class which manages expert ID +mappings and placement strategies for Expert Parallelism in MoE models. +""" + +import torch + +from vllm.config.parallel import ExpertPlacementStrategy +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + init_aiter_topK_meta_data, +) + +logger = init_logger(__name__) + + +def determine_expert_map( + ep_size: int, + ep_rank: int, + global_num_experts: int, + expert_placement_strategy: ExpertPlacementStrategy = "linear", + num_fused_shared_experts: int = 0, + return_expert_mask: bool = False, +) -> tuple[int, torch.Tensor | None, torch.Tensor | None]: + """ + Calculates how many experts should be assigned to each rank for EP and + creates a mapping from global to local expert index. Experts are + distributed evenly across ranks. Any remaining are assigned to the + last rank. + + Args: + ep_size: The size of the expert parallel group + ep_rank: The rank of the current process in the expert parallel + group + global_num_experts: The total number of experts in the model. + expert_placement_strategy: The expert placement strategy. + num_fused_shared_experts: Number of fused shared experts (for AITER) + return_expert_mask: Whether to return expert mask for AITER + + Returns: + tuple[int, Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple containing: + - local_num_experts (int): The number of experts assigned + to the current rank. + - expert_map (Optional[torch.Tensor]): A tensor of shape + (global_num_experts,) mapping from global to local index. + Contains -1 for experts not assigned to the current rank. + Returns None if ep_size is 1. + - expert_mask (Optional[torch.Tensor]): A tensor of shape + (global_num_experts + num_fused_shared_experts + 1,) + containing 1 for experts assigned to the current rank + and 0 for sentinel. + Returns None if ep_size is 1. + Used only when AITER MOE is enabled. + """ + from typing import get_args + + assert ep_size > 0 + if ep_size == 1: + return (global_num_experts, None, None) + + # Distribute experts as evenly as possible to each rank. + base_experts = global_num_experts // ep_size + remainder = global_num_experts % ep_size + local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts + + # Create a tensor of size num_experts filled with -1 + expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32) + # Create an expert map for the local experts + if expert_placement_strategy == "linear": + start_idx = ep_rank * base_experts + min(ep_rank, remainder) + expert_map[start_idx : start_idx + local_num_experts] = torch.arange( + 0, local_num_experts, dtype=torch.int32 + ) + elif expert_placement_strategy == "round_robin": + local_log_experts = torch.arange( + ep_rank, global_num_experts, ep_size, dtype=torch.int32 + ) + + expert_map[local_log_experts] = torch.arange( + 0, local_num_experts, dtype=torch.int32 + ) + else: + raise ValueError( + "Unsupported expert placement strategy " + f"'{expert_placement_strategy}', expected one of " + f"{get_args(ExpertPlacementStrategy)}" + ) + + expert_mask = None + if return_expert_mask: + expert_mask = torch.ones( + (global_num_experts + num_fused_shared_experts + 1,), dtype=torch.int32 + ) + expert_mask[-1] = 0 + expert_mask[:global_num_experts] = expert_map > -1 + expert_map = torch.cat( + ( + expert_map, + torch.tensor( + [local_num_experts + i for i in range(num_fused_shared_experts)], + dtype=torch.int32, + ), + ), + dim=0, + ) + + return (local_num_experts, expert_map, expert_mask) + + +def determine_expert_placement_strategy( + expert_placement_strategy: ExpertPlacementStrategy, + moe_parallel_config: FusedMoEParallelConfig, + num_expert_group: int | None, + num_redundant_experts: int, + enable_eplb: bool, +) -> ExpertPlacementStrategy: + if expert_placement_strategy == "round_robin": + round_robin_supported = ( + (num_expert_group is not None and num_expert_group > 1) + and num_redundant_experts == 0 + and not enable_eplb + ) + + if not round_robin_supported: + logger.warning( + "Round-robin expert placement is only supported for " + "models with multiple expert groups and no redundant " + "experts. Falling back to linear expert placement." + ) + return "linear" + if ( + moe_parallel_config.use_all2all_kernels + and not moe_parallel_config.use_deepep_ll_kernels + and not moe_parallel_config.use_nixl_ep_kernels + ): + logger.warning( + "Round-robin expert placement currently only supports " + "the DeepEP low-latency or NIXL EP backend, but '%s' was configured. " + "Falling back to linear expert placement.", + moe_parallel_config.all2all_backend, + ) + return "linear" + + return expert_placement_strategy + + +class ExpertMapManager: + """ + Manages expert ID mappings and placement for Expert Parallelism. + + Responsibilities: + - Calculate local vs global expert counts + - Map between global, local, and physical expert IDs + - Manage placement strategies (linear, round_robin) + - Maintain routing tables for round-robin placement + - Support dynamic reconfiguration of EP topology + """ + + def __init__( + self, + max_num_batched_tokens: int, + top_k: int, + global_num_experts: int, + logical_num_experts: int, + num_redundant_experts: int, + num_expert_group: int | None, + moe_parallel_config: FusedMoEParallelConfig, + placement_strategy: ExpertPlacementStrategy, + enable_eplb: bool, + num_fused_shared_experts: int = 0, + rocm_aiter_enabled: bool = False, + device: torch.device | None = None, + ): + """ + Initialize expert map manager. + + Args: + global_num_experts: Total number of experts across all ranks + logical_num_experts: Number of logical (non-redundant) experts + moe_parallel_config: MoE parallel configuration (contains ep_size, + ep_rank, backend flags) + placement_strategy: Strategy for placing experts ('linear' or 'round_robin') + num_fused_shared_experts: Number of fused shared experts (for AITER) + rocm_aiter_enabled: Whether ROCm AITER fusion is enabled + device: Device for tensor allocations + """ + self.global_num_experts = global_num_experts + self.logical_num_experts = logical_num_experts + self.moe_parallel_config = moe_parallel_config + self.num_fused_shared_experts = num_fused_shared_experts + self.rocm_aiter_enabled = rocm_aiter_enabled + self.device = device + + if moe_parallel_config.use_ep: + # Determine expert placement strategy before creating manager + # TODO move into EMM + placement_strategy = determine_expert_placement_strategy( + expert_placement_strategy=placement_strategy, + moe_parallel_config=moe_parallel_config, + num_expert_group=num_expert_group, + num_redundant_experts=num_redundant_experts, + enable_eplb=enable_eplb, + ) + + # Determine effective placement strategy + self._placement_strategy = self._determine_placement_strategy( + placement_strategy + ) + + # Calculate expert mappings + self._calculate_expert_maps() + + # Initialize routing tables if needed + self._maybe_init_routing_tables() + + self._init_aiter_shared_experts_topK_buffer( + dp_size=self.moe_parallel_config.dp_size, + top_k=top_k, + max_num_batched_tokens=max_num_batched_tokens, + ) + + if self.use_ep and self.rocm_aiter_enabled: + expert_mask = self.expert_mask + assert expert_mask is None or torch.all( + (expert_mask == 0) | (expert_mask == 1) + ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." + + # Log EP configuration (move into EMM?) + if self.use_ep: + logger.info_once( + "[EP Rank %s/%s] Expert parallelism is enabled. Expert " + "placement strategy: %s. Local/global" + " number of experts: %s/%s. Experts local to global index map:" + " %s.", + self.ep_rank, + self.ep_size, + self.placement_strategy, + self.local_num_experts, + self.global_num_experts, + self.get_compressed_map_string(), + ) + + def _init_aiter_shared_experts_topK_buffer( + self, + dp_size: int, + top_k: int, + max_num_batched_tokens: int, + ): + if self.num_fused_shared_experts > 0: + init_aiter_topK_meta_data( + n_routed_experts=self.global_num_experts, + n_shared_experts=self.num_fused_shared_experts, + top_k=top_k, + tp_rank=self.ep_rank if self.use_ep else self.tp_rank, + tp_size=self.ep_size if self.use_ep else self.tp_size, + shared_experts_score=1.0, + max_num_tokens=max_num_batched_tokens * dp_size, + is_EP=self.use_ep, + ) + self._local_num_experts += self.num_fused_shared_experts + + @property + def use_ep(self) -> int: + return self.moe_parallel_config.use_ep + + @property + def ep_size(self) -> int: + return self.moe_parallel_config.ep_size + + @property + def ep_rank(self) -> int: + return self.moe_parallel_config.ep_rank + + @property + def tp_size(self) -> int: + return self.moe_parallel_config.tp_size + + @property + def tp_rank(self) -> int: + return self.moe_parallel_config.tp_rank + + @property + def local_num_experts(self) -> int: + return self._local_num_experts + + @property + def expert_map(self) -> torch.Tensor | None: + """ + Mapping from global expert ID to local expert ID. + + Returns tensor of shape (global_num_experts,) where: + - expert_map[global_id] = local_id if expert is on this rank + - expert_map[global_id] = -1 if expert is not on this rank + + Returns None if EP is not enabled (ep_size == 1). + """ + return self._expert_map + + @property + def expert_mask(self) -> torch.Tensor | None: + """ + Expert mask for AITER fusion (ROCm-specific). + + Returns tensor of shape (global_num_experts + num_fused_shared + 1,) + where 1 indicates expert is on this rank, 0 otherwise. + """ + return self._expert_mask + + @property + def placement_strategy(self) -> ExpertPlacementStrategy: + """Expert placement strategy ('linear' or 'round_robin').""" + return self._placement_strategy + + @property + def routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + """ + Routing tables for round-robin placement. + + Returns (global_to_physical, physical_to_global, local_to_global) + or None if not using round-robin or tables not needed. + """ + if not hasattr(self, "_routing_tables"): + return None + return self._routing_tables + + def map_global_to_local(self, global_id: int) -> int: + """ + Map global expert ID to local expert ID. + + Args: + global_id: Global expert ID (0 to global_num_experts - 1) + + Returns: + Local expert ID (0 to local_num_experts - 1) + + Raises: + ValueError: If expert is not on this rank + """ + if self._expert_map is None: + return global_id + + return self._expert_map[global_id].item() + + def is_local_expert(self, global_id: int) -> bool: + """Check if expert is assigned to this rank.""" + if self._expert_map is None: + return True + return self._expert_map[global_id] != -1 + + def get_local_expert_ids(self) -> list[int]: + """Get list of global IDs for experts on this rank.""" + if self._expert_map is None: + return list(range(self.global_num_experts)) + + return torch.where(self._expert_map != -1)[0].tolist() + + def update( + self, + new_ep_size: int | None = None, + new_ep_rank: int | None = None, + ) -> None: + """ + Update expert mappings for new EP configuration. + + Used during dynamic reconfiguration (e.g., elastic scaling). + + Args: + new_ep_size: New EP world size (if changed) + new_ep_rank: New EP rank (if changed) + """ + if new_ep_size is not None: + self.moe_parallel_config.ep_size = new_ep_size + if new_ep_rank is not None: + self.moe_parallel_config.ep_rank = new_ep_rank + + # Recalculate everything + self._placement_strategy = self._determine_placement_strategy( + self._placement_strategy + ) + self._calculate_expert_maps() + self._maybe_init_routing_tables() + + def get_compressed_map_string(self) -> str: + """ + Get compressed string representation of expert map for logging. + + Returns string mapping local to global expert IDs. + """ + if self._expert_map is None: + return f"[0..{self.global_num_experts - 1}]" + + global_indices = torch.where(self._expert_map != -1)[0] + local_indices = self._expert_map[global_indices] + return ", ".join( + f"{local_index.item()}->{global_index.item()}" + for local_index, global_index in zip(local_indices, global_indices) + ) + + # Private methods + + def _determine_placement_strategy( + self, requested_strategy: ExpertPlacementStrategy + ) -> ExpertPlacementStrategy: + """Determine effective placement strategy based on config.""" + if requested_strategy != "round_robin": + return requested_strategy + + # Round-robin requires specific conditions + if self.ep_size == 1: + return "linear" + + if ( + self.moe_parallel_config.use_all2all_kernels + and not self.moe_parallel_config.use_deepep_ll_kernels + and not self.moe_parallel_config.use_nixl_ep_kernels + ): + logger.warning( + "Round-robin placement requires DeepEP-ll or NIXL backend. " + "Falling back to linear." + ) + return "linear" + + return "round_robin" + + def _calculate_expert_maps(self) -> None: + """Calculate expert mappings based on placement strategy.""" + if self.ep_size == 1: + # No EP, all experts are local + self._local_num_experts = self.global_num_experts + self._expert_map = None + self._expert_mask = None + return + + # Call determine_expert_map with current config + ( + self._local_num_experts, + self._expert_map, + self._expert_mask, + ) = determine_expert_map( + ep_size=self.ep_size, + ep_rank=self.ep_rank, + global_num_experts=self.global_num_experts, + expert_placement_strategy=self._placement_strategy, + num_fused_shared_experts=self.num_fused_shared_experts, + return_expert_mask=self.rocm_aiter_enabled, + ) + + # Move to device if specified + if self.device is not None: + if self._expert_map is not None: + self._expert_map = self._expert_map.to(self.device) + if self._expert_mask is not None: + self._expert_mask = self._expert_mask.to(self.device) + + def _maybe_init_routing_tables(self) -> None: + """Initialize routing tables if needed for round-robin.""" + if self._placement_strategy != "round_robin": + return + + if ( + not self.moe_parallel_config.use_deepep_ll_kernels + and not self.moe_parallel_config.use_nixl_ep_kernels + ): + return + + if self._expert_map is None: + return + + self._routing_tables = self._ensure_round_robin_expert_routing_tables() + + def _ensure_round_robin_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Build routing tables for round-robin placement.""" + device_kwargs = {"device": self.device} if self.device is not None else {} + + global_indices = torch.arange( + self.global_num_experts, dtype=torch.long, **device_kwargs + ) + owner = torch.remainder(global_indices, self.ep_size) + local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") + + base = self.global_num_experts // self.ep_size + remainder = self.global_num_experts % self.ep_size + physical_offset = owner * base + + if remainder > 0: + remainder_tensor = torch.tensor( + remainder, dtype=torch.long, **device_kwargs + ) + physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) + + global_to_physical = physical_offset + local_index + physical_to_global = torch.empty_like(global_to_physical) + physical_to_global[global_to_physical] = global_indices + + local_global = torch.arange( + self.ep_rank, + self.global_num_experts, + self.ep_size, + dtype=torch.long, + **device_kwargs, + ) + if local_global.numel() != self._local_num_experts: + local_global = local_global[: self._local_num_experts] + + return (global_to_physical, physical_to_global, local_global) From ba52a86c21303614c0fc6fbd56ff7103c81431f7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 23 Apr 2026 21:07:52 +0000 Subject: [PATCH 02/21] wip Signed-off-by: Bill Nell --- tests/distributed/test_expert_placement.py | 4 +- .../kernels/moe/test_moe_permute_unpermute.py | 4 +- .../layers/fused_moe/expert_map_manager.py | 17 + vllm/model_executor/layers/fused_moe/layer.py | 330 +++--------------- 4 files changed, 73 insertions(+), 282 deletions(-) diff --git a/tests/distributed/test_expert_placement.py b/tests/distributed/test_expert_placement.py index 8b3a64b9c134..46f63408f467 100644 --- a/tests/distributed/test_expert_placement.py +++ b/tests/distributed/test_expert_placement.py @@ -3,7 +3,9 @@ import pytest -from vllm.model_executor.layers.fused_moe.layer import determine_expert_map +from vllm.model_executor.layers.fused_moe.expert_map_manager import ( + determine_expert_map, +) def verify_round_robin_pattern(expert_map, ep_rank, ep_size, global_num_experts): diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 92126171a17b..5aafb89589fd 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -10,7 +10,9 @@ import torch from vllm.model_executor.layers.fused_moe import fused_topk -from vllm.model_executor.layers.fused_moe.layer import determine_expert_map +from vllm.model_executor.layers.fused_moe.expert_map_manager import ( + determine_expert_map, +) from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( moe_permute, moe_permute_unpermute_supported, diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 5364f4163102..e587a854ea23 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -365,6 +365,9 @@ def update( self, new_ep_size: int | None = None, new_ep_rank: int | None = None, + dp_size: int | None = None, + top_k: int | None = None, + max_num_batched_tokens: int | None = None, ) -> None: """ Update expert mappings for new EP configuration. @@ -374,6 +377,10 @@ def update( Args: new_ep_size: New EP world size (if changed) new_ep_rank: New EP rank (if changed) + dp_size: New DP size (if changed, for AITER buffer reinitialization) + top_k: New top_k (if changed, for AITER buffer reinitialization) + max_num_batched_tokens: New max batched tokens (if changed, for AITER + buffer reinitialization) """ if new_ep_size is not None: self.moe_parallel_config.ep_size = new_ep_size @@ -387,6 +394,16 @@ def update( self._calculate_expert_maps() self._maybe_init_routing_tables() + # Reinitialize AITER buffer if needed and parameters provided + if self.num_fused_shared_experts > 0 and all( + x is not None for x in [dp_size, top_k, max_num_batched_tokens] + ): + self._init_aiter_shared_experts_topK_buffer( + dp_size=dp_size, # type: ignore + top_k=top_k, # type: ignore + max_num_batched_tokens=max_num_batched_tokens, # type: ignore + ) + def get_compressed_map_string(self) -> str: """ Get compressed string representation of expert map for logging. diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7174cdd88f25..5fc7c6cc5467 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -3,13 +3,13 @@ from collections.abc import Callable, Iterable from enum import Enum -from typing import Literal, cast, get_args, overload +from typing import Literal, cast, overload import torch from torch.nn.parameter import UninitializedParameter from vllm._aiter_ops import rocm_aiter_ops -from vllm.config import VllmConfig, get_current_vllm_config +from vllm.config import get_current_vllm_config from vllm.config.parallel import ExpertPlacementStrategy from vllm.distributed import ( get_dp_group, @@ -26,15 +26,15 @@ FusedMoEQuantConfig, RoutingMethodType, ) +from vllm.model_executor.layers.fused_moe.expert_map_manager import ( + ExpertMapManager, +) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) -from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - init_aiter_topK_meta_data, -) from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) @@ -68,152 +68,6 @@ class FusedMoeWeightScaleSupported(Enum): BLOCK = "block" -def determine_expert_map( - ep_size: int, - ep_rank: int, - global_num_experts: int, - expert_placement_strategy: ExpertPlacementStrategy = "linear", - num_fused_shared_experts: int = 0, - return_expert_mask: bool = False, -) -> tuple[int, torch.Tensor | None, torch.Tensor | None]: - """ - Calculates how many experts should be assigned to each rank for EP and - creates a mapping from global to local expert index. Experts are - distributed evenly across ranks. Any remaining are assigned to the - last rank. - - Args: - ep_size: The size of the expert parallel group - ep_rank: The rank of the current process in the expert parallel - group - global_num_experts: The total number of experts in the model. - expert_placement_strategy: The expert placement strategy. - - Returns: - tuple[int, Optional[torch.Tensor]]: A tuple containing: - - local_num_experts (int): The number of experts assigned - to the current rank. - - expert_map (Optional[torch.Tensor]): A tensor of shape - (global_num_experts,) mapping from global to local index. - Contains -1 for experts not assigned to the current rank. - Returns None if ep_size is 1. - - expert_mask (Optional[torch.Tensor]): A tensor of shape - (global_num_experts + num_fused_shared_experts + 1,) - containing 1 for experts assigned to the current rank - and 0 for sentinel. - Returns None if ep_size is 1. - Used only when AITER MOE is enabled. - """ - assert ep_size > 0 - if ep_size == 1: - return (global_num_experts, None, None) - - # Distribute experts as evenly as possible to each rank. - base_experts = global_num_experts // ep_size - remainder = global_num_experts % ep_size - local_num_experts = base_experts + 1 if ep_rank < remainder else base_experts - - # Create a tensor of size num_experts filled with -1 - expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32) - # Create an expert map for the local experts - if expert_placement_strategy == "linear": - start_idx = ep_rank * base_experts + min(ep_rank, remainder) - expert_map[start_idx : start_idx + local_num_experts] = torch.arange( - 0, local_num_experts, dtype=torch.int32 - ) - elif expert_placement_strategy == "round_robin": - local_log_experts = torch.arange( - ep_rank, global_num_experts, ep_size, dtype=torch.int32 - ) - - expert_map[local_log_experts] = torch.arange( - 0, local_num_experts, dtype=torch.int32 - ) - else: - raise ValueError( - "Unsupported expert placement strategy " - f"'{expert_placement_strategy}', expected one of " - f"{get_args(ExpertPlacementStrategy)}" - ) - - expert_mask = None - if return_expert_mask: - expert_mask = torch.ones( - (global_num_experts + num_fused_shared_experts + 1,), dtype=torch.int32 - ) - expert_mask[-1] = 0 - expert_mask[:global_num_experts] = expert_map > -1 - expert_map = torch.cat( - ( - expert_map, - torch.tensor( - [local_num_experts + i for i in range(num_fused_shared_experts)], - dtype=torch.int32, - ), - ), - dim=0, - ) - - return (local_num_experts, expert_map, expert_mask) - - -def determine_expert_placement_strategy( - expert_placement_strategy: ExpertPlacementStrategy, - moe_parallel_config: FusedMoEParallelConfig, - num_expert_group: int | None, - num_redundant_experts: int, - enable_eplb: bool, -) -> ExpertPlacementStrategy: - if expert_placement_strategy == "round_robin": - round_robin_supported = ( - (num_expert_group is not None and num_expert_group > 1) - and num_redundant_experts == 0 - and not enable_eplb - ) - - if not round_robin_supported: - logger.warning( - "Round-robin expert placement is only supported for " - "models with multiple expert groups and no redundant " - "experts. Falling back to linear expert placement." - ) - return "linear" - if ( - moe_parallel_config.use_all2all_kernels - and not moe_parallel_config.needs_round_robin_routing_tables - ): - logger.warning( - "Round-robin expert placement currently only supports " - "the DeepEP low-latency or NIXL EP backend, but '%s' was configured. " - "Falling back to linear expert placement.", - moe_parallel_config.all2all_backend, - ) - return "linear" - - return expert_placement_strategy - - -def get_compressed_expert_map(expert_map: torch.Tensor) -> str: - """ - Compresses the expert map by removing any -1 entries. - - Args: - expert_map (torch.Tensor): A tensor of shape (global_num_experts,) - mapping from global to local index. Contains -1 for experts not - assigned to the current rank. - - Returns: - str: A string mapping from local to global index. - Using str to support hashing for logging once only. - """ - global_indices = torch.where(expert_map != -1)[0] - local_indices = expert_map[global_indices] - return ", ".join( - f"{local_index.item()}->{global_index.item()}" - for local_index, global_index in zip(local_indices, global_indices) - ) - - # --8<-- [start:fused_moe] @PluggableLayer.register("fused_moe") class FusedMoE(PluggableLayer): @@ -384,54 +238,34 @@ def __init__( "Redundant experts are only supported with EPLB." ) - self.expert_placement_strategy = determine_expert_placement_strategy( - expert_placement_strategy=self.expert_placement_strategy, - moe_parallel_config=self.moe_parallel_config, - num_expert_group=num_expert_group, - num_redundant_experts=num_redundant_experts, - enable_eplb=self.enable_eplb, - ) + # Create ExpertMapManager to handle expert mapping and placement + self.expert_map_manager = ExpertMapManager( + max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens, + top_k=top_k, + global_num_experts=self.global_num_experts, + logical_num_experts=self.logical_num_experts, + num_redundant_experts=num_redundant_experts, + num_expert_group=num_expert_group, + moe_parallel_config=self.moe_parallel_config, + placement_strategy=self.expert_placement_strategy, + enable_eplb=self.enable_eplb, + num_fused_shared_experts=self.num_fused_shared_experts, + rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, + device=None, + ) - self._expert_map: torch.Tensor | None - local_num_experts, expert_map, expert_mask = determine_expert_map( - ep_size=self.ep_size, - ep_rank=self.ep_rank, - global_num_experts=self.global_num_experts, - expert_placement_strategy=self.expert_placement_strategy, - num_fused_shared_experts=self.num_fused_shared_experts, - return_expert_mask=self.rocm_aiter_fmoe_enabled, - ) - self.local_num_experts = local_num_experts - self.register_buffer("_expert_map", expert_map) - self.register_buffer("expert_mask", expert_mask) - self._maybe_init_expert_routing_tables() - logger.info_once( - "[EP Rank %s/%s] Expert parallelism is enabled. Expert " - "placement strategy: %s. Local/global" - " number of experts: %s/%s. Experts local to global index map:" - " %s.", - self.ep_rank, - self.ep_size, - self.expert_placement_strategy, - self.local_num_experts, - self.global_num_experts, - get_compressed_expert_map(self._expert_map), - ) - else: - self.local_num_experts, self._expert_map, self.expert_mask = ( - self.global_num_experts, - None, - None, - ) + # Extract properties from ExpertMapManager + self.local_num_experts = self.expert_map_manager.local_num_experts + self.expert_placement_strategy = self.expert_map_manager.placement_strategy + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) self.top_k = top_k - self._init_aiter_shared_experts_topK_buffer( - vllm_config=vllm_config, dp_size=dp_size_ - ) + # AITER buffer initialization is handled by ExpertMapManager if self.use_ep and self.rocm_aiter_fmoe_enabled: assert self.expert_mask is None or torch.all( - (expert_mask == 0) | (expert_mask == 1) + (self.expert_mask == 0) | (self.expert_mask == 1) ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." assert intermediate_size % self.tp_size == 0 @@ -705,17 +539,10 @@ def _maybe_init_expert_routing_tables( ), ) - if self._expert_map is None: + routing_tables = self.expert_map_manager.routing_tables + if routing_tables is None: return None - routing_tables = self.ensure_round_robin_expert_routing_tables( - global_num_experts=self.global_num_experts, - ep_size=self.ep_size, - ep_rank=self.ep_rank, - local_num_experts=self.local_num_experts, - device=self._expert_map.device, - ) - global_to_physical, physical_to_global, local_global = routing_tables self.register_buffer("expert_global_to_physical", global_to_physical) self.register_buffer("expert_physical_to_global", physical_to_global) @@ -723,66 +550,28 @@ def _maybe_init_expert_routing_tables( return routing_tables - @staticmethod - def ensure_round_robin_expert_routing_tables( - global_num_experts: int, - ep_size: int, - ep_rank: int, - local_num_experts: int, - device: torch.device | None = None, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - device_kwargs = {"device": device} if device is not None else {} - global_indices = torch.arange( - global_num_experts, dtype=torch.long, **device_kwargs - ) - owner = torch.remainder(global_indices, ep_size) - local_index = torch.div(global_indices, ep_size, rounding_mode="floor") - base = global_num_experts // ep_size - remainder = global_num_experts % ep_size - physical_offset = owner * base - if remainder > 0: - remainder_tensor = torch.tensor( - remainder, dtype=torch.long, **device_kwargs - ) - physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) - - global_to_physical = physical_offset + local_index - physical_to_global = torch.empty_like(global_to_physical) - physical_to_global[global_to_physical] = global_indices - - local_global = torch.arange( - ep_rank, - global_num_experts, - ep_size, - dtype=torch.long, - **device_kwargs, - ) - if local_global.numel() != local_num_experts: - local_global = local_global[:local_num_experts] - - return (global_to_physical, physical_to_global, local_global) - def update_expert_map(self): # ep_size and ep_rank should already be updated - assert self._expert_map is not None - with self._expert_map.device: - local_num_experts, expert_map, expert_mask = determine_expert_map( - ep_size=self.ep_size, - ep_rank=self.ep_rank, - global_num_experts=self.global_num_experts, - expert_placement_strategy=self.expert_placement_strategy, - num_fused_shared_experts=self.num_fused_shared_experts, - return_expert_mask=self.rocm_aiter_fmoe_enabled, - ) - self.local_num_experts = local_num_experts - self.register_buffer("_expert_map", expert_map) - self.register_buffer("expert_mask", expert_mask) - self._maybe_init_expert_routing_tables() - if self.aiter_fmoe_shared_expert_enabled: - self._init_aiter_shared_experts_topK_buffer( - vllm_config=get_current_vllm_config(), - dp_size=get_dp_group().world_size, - ) + # Update ExpertMapManager with new EP configuration + vllm_config = get_current_vllm_config() + self.expert_map_manager.update( + new_ep_size=self.ep_size, + new_ep_rank=self.ep_rank, + dp_size=get_dp_group().world_size + if self.aiter_fmoe_shared_expert_enabled + else None, + top_k=self.top_k if self.aiter_fmoe_shared_expert_enabled else None, + max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens + if self.aiter_fmoe_shared_expert_enabled + else None, + ) + + # Update local attributes from ExpertMapManager + self.local_num_experts = self.expert_map_manager.local_num_experts + self.expert_placement_strategy = self.expert_map_manager.placement_strategy + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + self._maybe_init_expert_routing_tables() def _load_per_tensor_weight_scale( self, @@ -1050,26 +839,7 @@ def _load_g_idx( expert_data.copy_(loaded_weight) def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - if self._expert_map is None: - return expert_id - return self._expert_map[expert_id].item() - - def _init_aiter_shared_experts_topK_buffer( - self, vllm_config: VllmConfig, dp_size: int - ): - if self.num_fused_shared_experts > 0: - init_aiter_topK_meta_data( - n_routed_experts=self.global_num_experts, - n_shared_experts=self.num_fused_shared_experts, - top_k=self.top_k, - tp_rank=self.ep_rank if self.use_ep else self.tp_rank, - tp_size=self.ep_size if self.use_ep else self.tp_size, - shared_experts_score=1.0, - max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens - * dp_size, - is_EP=self.use_ep, - ) - self.local_num_experts += self.num_fused_shared_experts + return self.expert_map_manager.map_global_to_local(expert_id) @overload def weight_loader( From 64cf4acd00ef0e02bc68921dd4ab47ca7688765d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 20:01:02 +0000 Subject: [PATCH 03/21] update Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 17 ++++++++++--- vllm/model_executor/layers/fused_moe/layer.py | 24 +++++++++++++------ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index e587a854ea23..5f2cac803ee3 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -476,8 +476,13 @@ def _calculate_expert_maps(self) -> None: if self._expert_mask is not None: self._expert_mask = self._expert_mask.to(self.device) - def _maybe_init_routing_tables(self) -> None: - """Initialize routing tables if needed for round-robin.""" + def ensure_routing_tables_initialized(self) -> None: + """ + Ensure routing tables are initialized if needed for round-robin. + + This is a public method that can be called to explicitly initialize + routing tables. It's safe to call multiple times (idempotent). + """ if self._placement_strategy != "round_robin": return @@ -490,7 +495,13 @@ def _maybe_init_routing_tables(self) -> None: if self._expert_map is None: return - self._routing_tables = self._ensure_round_robin_expert_routing_tables() + # Only initialize if not already initialized + if not hasattr(self, "_routing_tables"): + self._routing_tables = self._ensure_round_robin_expert_routing_tables() + + def _maybe_init_routing_tables(self): + """Initialize routing tables if needed for round-robin (internal).""" + self.ensure_routing_tables_initialized() def _ensure_round_robin_expert_routing_tables( self, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5fc7c6cc5467..5dda85d74c15 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -262,12 +262,6 @@ def __init__( self.top_k = top_k - # AITER buffer initialization is handled by ExpertMapManager - if self.use_ep and self.rocm_aiter_fmoe_enabled: - assert self.expert_mask is None or torch.all( - (self.expert_mask == 0) | (self.expert_mask == 1) - ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." - assert intermediate_size % self.tp_size == 0 intermediate_size_per_partition = intermediate_size // self.tp_size self.renormalize = renormalize @@ -539,10 +533,15 @@ def _maybe_init_expert_routing_tables( ), ) + # Explicitly ensure routing tables are initialized in ExpertMapManager + self.expert_map_manager._maybe_init_routing_tables() + + # Get routing tables from ExpertMapManager routing_tables = self.expert_map_manager.routing_tables if routing_tables is None: return None + # Register routing tables as buffers for this layer global_to_physical, physical_to_global, local_global = routing_tables self.register_buffer("expert_global_to_physical", global_to_physical) self.register_buffer("expert_physical_to_global", physical_to_global) @@ -553,6 +552,9 @@ def _maybe_init_expert_routing_tables( def update_expert_map(self): # ep_size and ep_rank should already be updated # Update ExpertMapManager with new EP configuration + # Note: ExpertMapManager.update() recalculates expert maps and + # reinitializes routing tables internally, so no need to call + # _maybe_init_expert_routing_tables() again vllm_config = get_current_vllm_config() self.expert_map_manager.update( new_ep_size=self.ep_size, @@ -571,7 +573,15 @@ def update_expert_map(self): self.expert_placement_strategy = self.expert_map_manager.placement_strategy self.register_buffer("_expert_map", self.expert_map_manager.expert_map) self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - self._maybe_init_expert_routing_tables() + + # Update routing table buffers if they exist + # Note: Routing tables are already initialized by ExpertMapManager.update() + routing_tables = self.expert_map_manager.routing_tables + if routing_tables is not None: + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) def _load_per_tensor_weight_scale( self, From 42c7fc4b009c4bb82d9b36a7723116c2ad31b749 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 20:12:40 +0000 Subject: [PATCH 04/21] merge Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 6 ++---- vllm/model_executor/layers/fused_moe/layer.py | 12 ++++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 5f2cac803ee3..7438c8330c85 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -483,13 +483,11 @@ def ensure_routing_tables_initialized(self) -> None: This is a public method that can be called to explicitly initialize routing tables. It's safe to call multiple times (idempotent). """ + # Only needed for round-robin with DeepEP-ll or NIXL EP backends if self._placement_strategy != "round_robin": return - if ( - not self.moe_parallel_config.use_deepep_ll_kernels - and not self.moe_parallel_config.use_nixl_ep_kernels - ): + if not self.moe_parallel_config.needs_round_robin_routing_tables: return if self._expert_map is None: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5dda85d74c15..13610e1ef3bd 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -516,13 +516,7 @@ def is_internal_router(self) -> bool: def _maybe_init_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: - # Currently routing_tables only needed for round-robin expert placement - # with DeepEP-ll or NIXL EP all2all backends. - if self.expert_placement_strategy != "round_robin" or ( - not self.moe_parallel_config.needs_round_robin_routing_tables - ): - return None - + # Return cached routing tables if already registered as buffers if hasattr(self, "expert_global_to_physical"): return cast( tuple[torch.Tensor, torch.Tensor, torch.Tensor], @@ -533,7 +527,9 @@ def _maybe_init_expert_routing_tables( ), ) - # Explicitly ensure routing tables are initialized in ExpertMapManager + # Delegate to ExpertMapManager to initialize routing tables if needed + # (ExpertMapManager determines if routing tables are needed based on + # placement strategy and backend configuration) self.expert_map_manager._maybe_init_routing_tables() # Get routing tables from ExpertMapManager From 2332fd70bf3292798a47477ee9e929602da3caa4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 27 Apr 2026 22:18:07 +0000 Subject: [PATCH 05/21] fix num_local_expert update Signed-off-by: Bill Nell --- .../model_executor/layers/fused_moe/expert_map_manager.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 7438c8330c85..ae788d81c77e 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -262,7 +262,6 @@ def _init_aiter_shared_experts_topK_buffer( max_num_tokens=max_num_batched_tokens * dp_size, is_EP=self.use_ep, ) - self._local_num_experts += self.num_fused_shared_experts @property def use_ep(self) -> int: @@ -391,6 +390,7 @@ def update( self._placement_strategy = self._determine_placement_strategy( self._placement_strategy ) + self._calculate_expert_maps() self._maybe_init_routing_tables() @@ -469,6 +469,8 @@ def _calculate_expert_maps(self) -> None: return_expert_mask=self.rocm_aiter_enabled, ) + self._local_num_experts += self.num_fused_shared_experts + # Move to device if specified if self.device is not None: if self._expert_map is not None: @@ -505,6 +507,10 @@ def _ensure_round_robin_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Build routing tables for round-robin placement.""" + assert self.num_fused_shared_experts == 0, ( + "Round robin not supported for AITER." + ) + device_kwargs = {"device": self.device} if self.device is not None else {} global_indices = torch.arange( From aa210d3f3d8625d5e7a5ba0a5c9ceba9807ccaa3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 28 Apr 2026 22:33:56 +0000 Subject: [PATCH 06/21] fix Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index ae788d81c77e..ba964e1ab7b7 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -135,8 +135,7 @@ def determine_expert_placement_strategy( return "linear" if ( moe_parallel_config.use_all2all_kernels - and not moe_parallel_config.use_deepep_ll_kernels - and not moe_parallel_config.use_nixl_ep_kernels + and not moe_parallel_config.needs_round_robin_routing_tables ): logger.warning( "Round-robin expert placement currently only supports " @@ -435,8 +434,7 @@ def _determine_placement_strategy( if ( self.moe_parallel_config.use_all2all_kernels - and not self.moe_parallel_config.use_deepep_ll_kernels - and not self.moe_parallel_config.use_nixl_ep_kernels + and not self.moe_parallel_config.needs_round_robin_routing_tables ): logger.warning( "Round-robin placement requires DeepEP-ll or NIXL backend. " From c74f2856175c6bc99e645dfa711ac34f243b558d Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 29 Apr 2026 19:53:58 +0000 Subject: [PATCH 07/21] try to fix doc Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index ba964e1ab7b7..0e204f4d1fb8 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -378,7 +378,7 @@ def update( dp_size: New DP size (if changed, for AITER buffer reinitialization) top_k: New top_k (if changed, for AITER buffer reinitialization) max_num_batched_tokens: New max batched tokens (if changed, for AITER - buffer reinitialization) + buffer reinitialization) """ if new_ep_size is not None: self.moe_parallel_config.ep_size = new_ep_size From c1a332c144f33f0a29f61514c50e68d611f690e0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 19:01:16 +0000 Subject: [PATCH 08/21] review comments Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 75 ++++++++++--------- vllm/model_executor/layers/fused_moe/layer.py | 1 - 2 files changed, 40 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 0e204f4d1fb8..4bdda09cc151 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -173,7 +173,6 @@ def __init__( enable_eplb: bool, num_fused_shared_experts: int = 0, rocm_aiter_enabled: bool = False, - device: torch.device | None = None, ): """ Initialize expert map manager. @@ -186,14 +185,12 @@ def __init__( placement_strategy: Strategy for placing experts ('linear' or 'round_robin') num_fused_shared_experts: Number of fused shared experts (for AITER) rocm_aiter_enabled: Whether ROCm AITER fusion is enabled - device: Device for tensor allocations """ self.global_num_experts = global_num_experts self.logical_num_experts = logical_num_experts self.moe_parallel_config = moe_parallel_config self.num_fused_shared_experts = num_fused_shared_experts self.rocm_aiter_enabled = rocm_aiter_enabled - self.device = device if moe_parallel_config.use_ep: # Determine expert placement strategy before creating manager @@ -229,7 +226,7 @@ def __init__( (expert_mask == 0) | (expert_mask == 1) ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." - # Log EP configuration (move into EMM?) + # Log EP configuration if self.use_ep: logger.info_once( "[EP Rank %s/%s] Expert parallelism is enabled. Expert " @@ -244,6 +241,15 @@ def __init__( self.get_compressed_map_string(), ) + @property + def device(self) -> torch.device: + if self._expert_map is not None: + return self._expert_map.device + elif self._expert_mask is not None: + return self._expert_mask.device + else: + raise RuntimeError("no device available") + def _init_aiter_shared_experts_topK_buffer( self, dp_size: int, @@ -380,28 +386,29 @@ def update( max_num_batched_tokens: New max batched tokens (if changed, for AITER buffer reinitialization) """ - if new_ep_size is not None: - self.moe_parallel_config.ep_size = new_ep_size - if new_ep_rank is not None: - self.moe_parallel_config.ep_rank = new_ep_rank + with self.device: + if new_ep_size is not None: + self.moe_parallel_config.ep_size = new_ep_size + if new_ep_rank is not None: + self.moe_parallel_config.ep_rank = new_ep_rank + + # Recalculate everything + self._placement_strategy = self._determine_placement_strategy( + self._placement_strategy + ) - # Recalculate everything - self._placement_strategy = self._determine_placement_strategy( - self._placement_strategy - ) + self._calculate_expert_maps() + self._maybe_init_routing_tables() - self._calculate_expert_maps() - self._maybe_init_routing_tables() - - # Reinitialize AITER buffer if needed and parameters provided - if self.num_fused_shared_experts > 0 and all( - x is not None for x in [dp_size, top_k, max_num_batched_tokens] - ): - self._init_aiter_shared_experts_topK_buffer( - dp_size=dp_size, # type: ignore - top_k=top_k, # type: ignore - max_num_batched_tokens=max_num_batched_tokens, # type: ignore - ) + # Reinitialize AITER buffer if needed and parameters provided + if self.num_fused_shared_experts > 0 and all( + x is not None for x in [dp_size, top_k, max_num_batched_tokens] + ): + self._init_aiter_shared_experts_topK_buffer( + dp_size=dp_size, # type: ignore + top_k=top_k, # type: ignore + max_num_batched_tokens=max_num_batched_tokens, # type: ignore + ) def get_compressed_map_string(self) -> str: """ @@ -469,13 +476,6 @@ def _calculate_expert_maps(self) -> None: self._local_num_experts += self.num_fused_shared_experts - # Move to device if specified - if self.device is not None: - if self._expert_map is not None: - self._expert_map = self._expert_map.to(self.device) - if self._expert_mask is not None: - self._expert_mask = self._expert_mask.to(self.device) - def ensure_routing_tables_initialized(self) -> None: """ Ensure routing tables are initialized if needed for round-robin. @@ -509,10 +509,13 @@ def _ensure_round_robin_expert_routing_tables( "Round robin not supported for AITER." ) - device_kwargs = {"device": self.device} if self.device is not None else {} + assert self._expert_map is not None + device = self._expert_map.device global_indices = torch.arange( - self.global_num_experts, dtype=torch.long, **device_kwargs + self.global_num_experts, + dtype=torch.long, + device=device, ) owner = torch.remainder(global_indices, self.ep_size) local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") @@ -523,7 +526,9 @@ def _ensure_round_robin_expert_routing_tables( if remainder > 0: remainder_tensor = torch.tensor( - remainder, dtype=torch.long, **device_kwargs + remainder, + dtype=torch.long, + device=device, ) physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) @@ -536,7 +541,7 @@ def _ensure_round_robin_expert_routing_tables( self.global_num_experts, self.ep_size, dtype=torch.long, - **device_kwargs, + device=device, ) if local_global.numel() != self._local_num_experts: local_global = local_global[: self._local_num_experts] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 13610e1ef3bd..2eee060ee735 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -251,7 +251,6 @@ def __init__( enable_eplb=self.enable_eplb, num_fused_shared_experts=self.num_fused_shared_experts, rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, - device=None, ) # Extract properties from ExpertMapManager From 4a3d996fb21835cbbc2c1289a1f38109903b3a6e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 19:52:37 +0000 Subject: [PATCH 09/21] cleanup routing table initialization and updating Signed-off-by: Bill Nell --- tests/kernels/moe/test_moe.py | 2 +- .../layers/fused_moe/expert_map_manager.py | 10 +-- vllm/model_executor/layers/fused_moe/layer.py | 63 +++++++------------ .../fused_moe/unquantized_fused_moe_method.py | 2 +- .../compressed_tensors_moe_w4a4_mxfp4.py | 2 +- .../compressed_tensors_moe_w4a4_nvfp4.py | 2 +- .../compressed_tensors_moe_w8a8_fp8.py | 2 +- .../compressed_tensors_moe_w8a8_int8.py | 2 +- .../compressed_tensors_moe_w8a8_mxfp8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 2 +- .../layers/quantization/gptq_marlin.py | 2 +- .../layers/quantization/modelopt.py | 4 +- .../layers/quantization/mxfp4.py | 4 +- .../layers/quantization/online/fp8.py | 2 +- .../layers/quantization/online/int8.py | 2 +- .../layers/quantization/online/mxfp8.py | 2 +- .../layers/quantization/quark/quark_moe.py | 2 +- 17 files changed, 42 insertions(+), 65 deletions(-) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index ebc3256b548f..3978f1cbe7af 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -1588,7 +1588,7 @@ def test_unquantized_bf16_flashinfer_trtllm_backend( layer.apply_router_weight_on_input = False layer.routed_scaling_factor = None layer.shared_experts = None - layer._maybe_init_expert_routing_tables = lambda: None + layer._expert_routing_tables = lambda: None quant_method.process_weights_after_loading(layer) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 4bdda09cc151..79b1f0aa70b7 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -212,7 +212,7 @@ def __init__( self._calculate_expert_maps() # Initialize routing tables if needed - self._maybe_init_routing_tables() + self._ensure_routing_tables_initialized() self._init_aiter_shared_experts_topK_buffer( dp_size=self.moe_parallel_config.dp_size, @@ -398,7 +398,7 @@ def update( ) self._calculate_expert_maps() - self._maybe_init_routing_tables() + self._ensure_routing_tables_initialized() # Reinitialize AITER buffer if needed and parameters provided if self.num_fused_shared_experts > 0 and all( @@ -476,7 +476,7 @@ def _calculate_expert_maps(self) -> None: self._local_num_experts += self.num_fused_shared_experts - def ensure_routing_tables_initialized(self) -> None: + def _ensure_routing_tables_initialized(self) -> None: """ Ensure routing tables are initialized if needed for round-robin. @@ -497,10 +497,6 @@ def ensure_routing_tables_initialized(self) -> None: if not hasattr(self, "_routing_tables"): self._routing_tables = self._ensure_round_robin_expert_routing_tables() - def _maybe_init_routing_tables(self): - """Initialize routing tables if needed for round-robin (internal).""" - self.ensure_routing_tables_initialized() - def _ensure_round_robin_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2eee060ee735..f77c4e9b52db 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -253,11 +253,7 @@ def __init__( rocm_aiter_enabled=self.rocm_aiter_fmoe_enabled, ) - # Extract properties from ExpertMapManager - self.local_num_experts = self.expert_map_manager.local_num_experts - self.expert_placement_strategy = self.expert_map_manager.placement_strategy - self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + self.update_expert_map_info() self.top_k = top_k @@ -458,9 +454,8 @@ def maybe_init_modular_kernel(self) -> None: self.ensure_moe_quant_config_init() # routing_tables only needed for round-robin expert placement with # DeepEP all2all backend. - routing_tables = self._maybe_init_expert_routing_tables() prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize( - routing_tables=routing_tables + routing_tables=self._expert_routing_tables() ) if prepare_finalize is not None: logger.debug( @@ -512,7 +507,23 @@ def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass return self.runner.is_internal_router() - def _maybe_init_expert_routing_tables( + def update_expert_map_info(self): + # Update local attributes from ExpertMapManager + self.local_num_experts = self.expert_map_manager.local_num_experts + self.expert_placement_strategy = self.expert_map_manager.placement_strategy + self.register_buffer("_expert_map", self.expert_map_manager.expert_map) + self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) + + # Get routing tables from ExpertMapManager + routing_tables = self.expert_map_manager.routing_tables + if routing_tables is not None: + # Register routing tables as buffers for this layer + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) + + def _expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: # Return cached routing tables if already registered as buffers @@ -525,31 +536,13 @@ def _maybe_init_expert_routing_tables( self.expert_local_to_global, ), ) - - # Delegate to ExpertMapManager to initialize routing tables if needed - # (ExpertMapManager determines if routing tables are needed based on - # placement strategy and backend configuration) - self.expert_map_manager._maybe_init_routing_tables() - - # Get routing tables from ExpertMapManager - routing_tables = self.expert_map_manager.routing_tables - if routing_tables is None: - return None - - # Register routing tables as buffers for this layer - global_to_physical, physical_to_global, local_global = routing_tables - self.register_buffer("expert_global_to_physical", global_to_physical) - self.register_buffer("expert_physical_to_global", physical_to_global) - self.register_buffer("expert_local_to_global", local_global) - - return routing_tables + return None def update_expert_map(self): # ep_size and ep_rank should already be updated # Update ExpertMapManager with new EP configuration # Note: ExpertMapManager.update() recalculates expert maps and - # reinitializes routing tables internally, so no need to call - # _maybe_init_expert_routing_tables() again + # reinitializes routing tables internally. vllm_config = get_current_vllm_config() self.expert_map_manager.update( new_ep_size=self.ep_size, @@ -564,19 +557,7 @@ def update_expert_map(self): ) # Update local attributes from ExpertMapManager - self.local_num_experts = self.expert_map_manager.local_num_experts - self.expert_placement_strategy = self.expert_map_manager.placement_strategy - self.register_buffer("_expert_map", self.expert_map_manager.expert_map) - self.register_buffer("expert_mask", self.expert_map_manager.expert_mask) - - # Update routing table buffers if they exist - # Note: Routing tables are already initialized by ExpertMapManager.update() - routing_tables = self.expert_map_manager.routing_tables - if routing_tables is not None: - global_to_physical, physical_to_global, local_global = routing_tables - self.register_buffer("expert_global_to_physical", global_to_physical) - self.register_buffer("expert_physical_to_global", physical_to_global) - self.register_buffer("expert_local_to_global", local_global) + self.update_expert_map_info() def _load_per_tensor_weight_scale( self, diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 89697033403d..8096179a3b5a 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -172,7 +172,7 @@ def _setup_kernel( moe_config=self.moe, backend=self.unquantized_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py index 629e1c5ef1be..01ffdfae0567 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py @@ -194,7 +194,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: experts_cls=self.experts_cls, mxfp4_backend=self.mxfp4_backend, shared_experts=layer.shared_experts, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), ) def apply( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py index 29c673d0f6e3..46b7db1f0475 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_nvfp4.py @@ -236,7 +236,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, experts_cls=self.experts_cls, shared_experts=layer.shared_experts, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py index bba7e0e7abce..433f7a5c76a7 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_fp8.py @@ -336,7 +336,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py index bad5b3895b8f..d39dbee747c0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_int8.py @@ -147,7 +147,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py index ecd0b54890d1..219a0526c481 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w8a8_mxfp8.py @@ -138,7 +138,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 1c9237d3f60a..58000c165947 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -774,7 +774,7 @@ def _setup_kernel( moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 7b6f1f9cf6cd..0156744fcc42 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -769,7 +769,7 @@ def _setup_kernel(self, layer: FusedMoE) -> None: w2_g_idx=layer.w2_g_idx, w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices, w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 242cc105e470..f9b65b5c77e5 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -896,7 +896,7 @@ def _setup_kernel( moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) @@ -1419,7 +1419,7 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: moe_config=self.moe, experts_cls=self.experts_cls, shared_experts=layer.shared_experts, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), ) self.moe_kernel.fused_experts.process_weights_after_loading(layer) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 0a516831c4ec..f8c7711ccb4e 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -364,7 +364,7 @@ def _setup_kernel( moe_config=self.moe, mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) @@ -690,7 +690,7 @@ def _setup_kernel( moe_config=self.moe, mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/online/fp8.py b/vllm/model_executor/layers/quantization/online/fp8.py index 9cb697289d7e..dfcff1e21685 100644 --- a/vllm/model_executor/layers/quantization/online/fp8.py +++ b/vllm/model_executor/layers/quantization/online/fp8.py @@ -348,7 +348,7 @@ def _setup_kernel( moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/online/int8.py b/vllm/model_executor/layers/quantization/online/int8.py index 4b4c87fbce96..f4d2f9a2a371 100644 --- a/vllm/model_executor/layers/quantization/online/int8.py +++ b/vllm/model_executor/layers/quantization/online/int8.py @@ -99,7 +99,7 @@ def _setup_kernel(self, layer: "FusedMoE") -> None: moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/online/mxfp8.py b/vllm/model_executor/layers/quantization/online/mxfp8.py index 39a32604442c..312da8a12158 100644 --- a/vllm/model_executor/layers/quantization/online/mxfp8.py +++ b/vllm/model_executor/layers/quantization/online/mxfp8.py @@ -199,7 +199,7 @@ def _setup_kernel( moe_config=self.moe, fp8_backend=self.fp8_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index d92acb85c265..9d9397e29f8d 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1350,7 +1350,7 @@ def _setup_kernel_via_oracle(self, layer: FusedMoE): moe_config=self.moe, mxfp4_backend=self.mxfp4_backend, experts_cls=self.experts_cls, - routing_tables=layer._maybe_init_expert_routing_tables(), + routing_tables=layer._expert_routing_tables(), shared_experts=layer.shared_experts, ) From 778c141a79e3db62ac6d1aa0bc71b5cfef6db7f5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 19:56:26 +0000 Subject: [PATCH 10/21] fix local_num_experts Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 79b1f0aa70b7..f7c6f88077a1 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -455,7 +455,9 @@ def _calculate_expert_maps(self) -> None: """Calculate expert mappings based on placement strategy.""" if self.ep_size == 1: # No EP, all experts are local - self._local_num_experts = self.global_num_experts + self._local_num_experts = ( + self.global_num_experts + self.num_fused_shared_experts + ) self._expert_map = None self._expert_mask = None return From 3c21f32381158c3037d24e15e90942c525e43a85 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 22:00:21 +0000 Subject: [PATCH 11/21] tweak Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index f7c6f88077a1..c5c02bd6c512 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -70,6 +70,7 @@ def determine_expert_map( # Create a tensor of size num_experts filled with -1 expert_map = torch.full((global_num_experts,), -1, dtype=torch.int32) + # Create an expert map for the local experts if expert_placement_strategy == "linear": start_idx = ep_rank * base_experts + min(ep_rank, remainder) @@ -507,13 +508,10 @@ def _ensure_round_robin_expert_routing_tables( "Round robin not supported for AITER." ) - assert self._expert_map is not None - device = self._expert_map.device - global_indices = torch.arange( self.global_num_experts, dtype=torch.long, - device=device, + device=self.device, ) owner = torch.remainder(global_indices, self.ep_size) local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") @@ -526,7 +524,7 @@ def _ensure_round_robin_expert_routing_tables( remainder_tensor = torch.tensor( remainder, dtype=torch.long, - device=device, + device=self.device, ) physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) @@ -539,7 +537,7 @@ def _ensure_round_robin_expert_routing_tables( self.global_num_experts, self.ep_size, dtype=torch.long, - device=device, + device=self.device, ) if local_global.numel() != self._local_num_experts: local_global = local_global[: self._local_num_experts] From 2dd2ea9219b5d741fe811571b5d686cf9e0397d4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 22:49:08 +0000 Subject: [PATCH 12/21] try to fix update_expert_map Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 60 +++++-------------- vllm/model_executor/layers/fused_moe/layer.py | 23 +++---- 2 files changed, 24 insertions(+), 59 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index c5c02bd6c512..6f89dc387eb9 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -166,7 +166,6 @@ def __init__( max_num_batched_tokens: int, top_k: int, global_num_experts: int, - logical_num_experts: int, num_redundant_experts: int, num_expert_group: int | None, moe_parallel_config: FusedMoEParallelConfig, @@ -180,7 +179,6 @@ def __init__( Args: global_num_experts: Total number of experts across all ranks - logical_num_experts: Number of logical (non-redundant) experts moe_parallel_config: MoE parallel configuration (contains ep_size, ep_rank, backend flags) placement_strategy: Strategy for placing experts ('linear' or 'round_robin') @@ -188,10 +186,11 @@ def __init__( rocm_aiter_enabled: Whether ROCm AITER fusion is enabled """ self.global_num_experts = global_num_experts - self.logical_num_experts = logical_num_experts self.moe_parallel_config = moe_parallel_config self.num_fused_shared_experts = num_fused_shared_experts self.rocm_aiter_enabled = rocm_aiter_enabled + self.top_k = top_k + self.max_num_batched_tokens = max_num_batched_tokens if moe_parallel_config.use_ep: # Determine expert placement strategy before creating manager @@ -215,11 +214,7 @@ def __init__( # Initialize routing tables if needed self._ensure_routing_tables_initialized() - self._init_aiter_shared_experts_topK_buffer( - dp_size=self.moe_parallel_config.dp_size, - top_k=top_k, - max_num_batched_tokens=max_num_batched_tokens, - ) + self._init_aiter_shared_experts_topK_buffer() if self.use_ep and self.rocm_aiter_enabled: expert_mask = self.expert_mask @@ -251,21 +246,17 @@ def device(self) -> torch.device: else: raise RuntimeError("no device available") - def _init_aiter_shared_experts_topK_buffer( - self, - dp_size: int, - top_k: int, - max_num_batched_tokens: int, - ): + def _init_aiter_shared_experts_topK_buffer(self): if self.num_fused_shared_experts > 0: + dp_size = self.moe_parallel_config.dp_size init_aiter_topK_meta_data( n_routed_experts=self.global_num_experts, n_shared_experts=self.num_fused_shared_experts, - top_k=top_k, + top_k=self.top_k, tp_rank=self.ep_rank if self.use_ep else self.tp_rank, tp_size=self.ep_size if self.use_ep else self.tp_size, shared_experts_score=1.0, - max_num_tokens=max_num_batched_tokens * dp_size, + max_num_tokens=self.max_num_batched_tokens * dp_size, is_EP=self.use_ep, ) @@ -368,11 +359,9 @@ def get_local_expert_ids(self) -> list[int]: def update( self, - new_ep_size: int | None = None, - new_ep_rank: int | None = None, - dp_size: int | None = None, - top_k: int | None = None, - max_num_batched_tokens: int | None = None, + moe_parallel_config: FusedMoEParallelConfig, + global_num_experts: int, + num_fused_shared_experts: int, ) -> None: """ Update expert mappings for new EP configuration. @@ -380,36 +369,17 @@ def update( Used during dynamic reconfiguration (e.g., elastic scaling). Args: - new_ep_size: New EP world size (if changed) - new_ep_rank: New EP rank (if changed) - dp_size: New DP size (if changed, for AITER buffer reinitialization) - top_k: New top_k (if changed, for AITER buffer reinitialization) - max_num_batched_tokens: New max batched tokens (if changed, for AITER - buffer reinitialization) """ - with self.device: - if new_ep_size is not None: - self.moe_parallel_config.ep_size = new_ep_size - if new_ep_rank is not None: - self.moe_parallel_config.ep_rank = new_ep_rank - - # Recalculate everything - self._placement_strategy = self._determine_placement_strategy( - self._placement_strategy - ) + self.moe_parallel_config = moe_parallel_config + self.global_num_experts = global_num_experts + self.num_fused_shared_experts = num_fused_shared_experts + with self.device: self._calculate_expert_maps() self._ensure_routing_tables_initialized() # Reinitialize AITER buffer if needed and parameters provided - if self.num_fused_shared_experts > 0 and all( - x is not None for x in [dp_size, top_k, max_num_batched_tokens] - ): - self._init_aiter_shared_experts_topK_buffer( - dp_size=dp_size, # type: ignore - top_k=top_k, # type: ignore - max_num_batched_tokens=max_num_batched_tokens, # type: ignore - ) + self._init_aiter_shared_experts_topK_buffer() def get_compressed_map_string(self) -> str: """ diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 49473d6316c0..16b95635ac82 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -238,12 +238,13 @@ def __init__( "Redundant experts are only supported with EPLB." ) + max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens + # Create ExpertMapManager to handle expert mapping and placement self.expert_map_manager = ExpertMapManager( - max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens, + max_num_batched_tokens=max_num_batched_tokens, top_k=top_k, global_num_experts=self.global_num_experts, - logical_num_experts=self.logical_num_experts, num_redundant_experts=num_redundant_experts, num_expert_group=num_expert_group, moe_parallel_config=self.moe_parallel_config, @@ -323,7 +324,7 @@ def __init__( in_dtype=moe_in_dtype, moe_backend=vllm_config.kernel_config.moe_backend, router_logits_dtype=router_logits_dtype, - max_num_tokens=vllm_config.scheduler_config.max_num_batched_tokens, + max_num_tokens=max_num_batched_tokens, has_bias=has_bias, is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, @@ -541,21 +542,15 @@ def _expert_routing_tables( return None def update_expert_map(self): - # ep_size and ep_rank should already be updated # Update ExpertMapManager with new EP configuration + # The moe_parallel_config (including ep_size and ep_rank) + # should already be updated. # Note: ExpertMapManager.update() recalculates expert maps and # reinitializes routing tables internally. - vllm_config = get_current_vllm_config() self.expert_map_manager.update( - new_ep_size=self.ep_size, - new_ep_rank=self.ep_rank, - dp_size=get_dp_group().world_size - if self.aiter_fmoe_shared_expert_enabled - else None, - top_k=self.top_k if self.aiter_fmoe_shared_expert_enabled else None, - max_num_batched_tokens=vllm_config.scheduler_config.max_num_batched_tokens - if self.aiter_fmoe_shared_expert_enabled - else None, + self.moe_parallel_config, + global_num_experts=self.global_num_experts, + num_fused_shared_experts=self.num_fused_shared_experts, ) # Update local attributes from ExpertMapManager From 979dd651b3c2a4d8bab95b9317822551cb51c44c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 23:00:39 +0000 Subject: [PATCH 13/21] remove unused arg Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 2 -- vllm/model_executor/layers/fused_moe/layer.py | 1 - 2 files changed, 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 6f89dc387eb9..de48a887c431 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -361,7 +361,6 @@ def update( self, moe_parallel_config: FusedMoEParallelConfig, global_num_experts: int, - num_fused_shared_experts: int, ) -> None: """ Update expert mappings for new EP configuration. @@ -372,7 +371,6 @@ def update( """ self.moe_parallel_config = moe_parallel_config self.global_num_experts = global_num_experts - self.num_fused_shared_experts = num_fused_shared_experts with self.device: self._calculate_expert_maps() diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 16b95635ac82..d2f87e4eff3f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -550,7 +550,6 @@ def update_expert_map(self): self.expert_map_manager.update( self.moe_parallel_config, global_num_experts=self.global_num_experts, - num_fused_shared_experts=self.num_fused_shared_experts, ) # Update local attributes from ExpertMapManager From 56d87385ffb0e693012993a8c8bd1ccdf6bd5a53 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 May 2026 23:02:21 +0000 Subject: [PATCH 14/21] update comment Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index de48a887c431..801de110137d 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -368,6 +368,9 @@ def update( Used during dynamic reconfiguration (e.g., elastic scaling). Args: + global_num_experts: New total number of experts across all ranks + moe_parallel_config: New MoE parallel configuration (contains ep_size, + ep_rank, backend flags) """ self.moe_parallel_config = moe_parallel_config self.global_num_experts = global_num_experts From ba6118e5726cea2f1fcbce7c99b91184ff630cfc Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 7 May 2026 19:59:41 +0000 Subject: [PATCH 15/21] review comments Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 45 ++++++++----------- vllm/model_executor/layers/fused_moe/layer.py | 2 - 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 801de110137d..29c3e48ea9ee 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -194,7 +194,6 @@ def __init__( if moe_parallel_config.use_ep: # Determine expert placement strategy before creating manager - # TODO move into EMM placement_strategy = determine_expert_placement_strategy( expert_placement_strategy=placement_strategy, moe_parallel_config=moe_parallel_config, @@ -212,7 +211,7 @@ def __init__( self._calculate_expert_maps() # Initialize routing tables if needed - self._ensure_routing_tables_initialized() + self._routing_tables = self._init_routing_tables() self._init_aiter_shared_experts_topK_buffer() @@ -237,15 +236,6 @@ def __init__( self.get_compressed_map_string(), ) - @property - def device(self) -> torch.device: - if self._expert_map is not None: - return self._expert_map.device - elif self._expert_mask is not None: - return self._expert_mask.device - else: - raise RuntimeError("no device available") - def _init_aiter_shared_experts_topK_buffer(self): if self.num_fused_shared_experts > 0: dp_size = self.moe_parallel_config.dp_size @@ -322,8 +312,6 @@ def routing_tables( Returns (global_to_physical, physical_to_global, local_to_global) or None if not using round-robin or tables not needed. """ - if not hasattr(self, "_routing_tables"): - return None return self._routing_tables def map_global_to_local(self, global_id: int) -> int: @@ -375,9 +363,16 @@ def update( self.moe_parallel_config = moe_parallel_config self.global_num_experts = global_num_experts - with self.device: + if self._expert_map is not None: + device = self._expert_map.device + elif self._expert_mask is not None: + device = self._expert_mask.device + else: + raise RuntimeError("no device available") + + with device: self._calculate_expert_maps() - self._ensure_routing_tables_initialized() + self._routing_tables = self._init_routing_tables() # Reinitialize AITER buffer if needed and parameters provided self._init_aiter_shared_experts_topK_buffer() @@ -450,28 +445,27 @@ def _calculate_expert_maps(self) -> None: self._local_num_experts += self.num_fused_shared_experts - def _ensure_routing_tables_initialized(self) -> None: + def _init_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: """ Ensure routing tables are initialized if needed for round-robin. This is a public method that can be called to explicitly initialize routing tables. It's safe to call multiple times (idempotent). """ - # Only needed for round-robin with DeepEP-ll or NIXL EP backends if self._placement_strategy != "round_robin": - return + return None if not self.moe_parallel_config.needs_round_robin_routing_tables: - return + return None if self._expert_map is None: - return + return None - # Only initialize if not already initialized - if not hasattr(self, "_routing_tables"): - self._routing_tables = self._ensure_round_robin_expert_routing_tables() + return self._init_round_robin_expert_routing_tables() - def _ensure_round_robin_expert_routing_tables( + def _init_round_robin_expert_routing_tables( self, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Build routing tables for round-robin placement.""" @@ -482,7 +476,6 @@ def _ensure_round_robin_expert_routing_tables( global_indices = torch.arange( self.global_num_experts, dtype=torch.long, - device=self.device, ) owner = torch.remainder(global_indices, self.ep_size) local_index = torch.div(global_indices, self.ep_size, rounding_mode="floor") @@ -495,7 +488,6 @@ def _ensure_round_robin_expert_routing_tables( remainder_tensor = torch.tensor( remainder, dtype=torch.long, - device=self.device, ) physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) @@ -508,7 +500,6 @@ def _ensure_round_robin_expert_routing_tables( self.global_num_experts, self.ep_size, dtype=torch.long, - device=self.device, ) if local_global.numel() != self._local_num_experts: local_global = local_global[: self._local_num_experts] diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d2f87e4eff3f..a6718a054779 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -455,8 +455,6 @@ def maybe_init_modular_kernel(self) -> None: return None self.ensure_moe_quant_config_init() - # routing_tables only needed for round-robin expert placement with - # DeepEP all2all backend. prepare_finalize = self.base_quant_method.maybe_make_prepare_finalize( routing_tables=self._expert_routing_tables() ) From ef6bdce461e7495983c402248bcf1d9c19bef0e8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 8 May 2026 18:58:57 +0000 Subject: [PATCH 16/21] changing exception type/message for clarity Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 29c3e48ea9ee..2b0492387bbf 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -368,7 +368,7 @@ def update( elif self._expert_mask is not None: device = self._expert_mask.device else: - raise RuntimeError("no device available") + raise AssertionError("_expert_map or _expert_mask must be present.") with device: self._calculate_expert_maps() diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index a6718a054779..3959beb584ee 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -437,6 +437,8 @@ def _get_quant_method() -> FusedMoEMethodBase: else 1.0, ) + print(f"MOE layer id = {self.layer_id}") + # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py # can safely swap out the quant_method. We should figure out a less # intrusive way to do this. From 587e8951206dd9634432e65501315257b924c97e Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 11 May 2026 11:24:53 -0400 Subject: [PATCH 17/21] Add import for ExpertMapManager in layer.py --- vllm/model_executor/layers/fused_moe/layer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 0d7ff703f86e..92861a184b4c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -28,6 +28,7 @@ ) from vllm.model_executor.layers.fused_moe.expert_map_manager import ( ExpertMapManager, +) from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import ( init_aiter_topK_meta_data, ) From b66852f2d40696f6d9f17de908031c967709921e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 11 May 2026 11:40:46 -0400 Subject: [PATCH 18/21] pre-commit fix Signed-off-by: Robert Shaw --- vllm/model_executor/layers/fused_moe/layer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 92861a184b4c..232093400fcb 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -29,9 +29,6 @@ from vllm.model_executor.layers.fused_moe.expert_map_manager import ( ExpertMapManager, ) -from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import ( - init_aiter_topK_meta_data, -) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) From d30dde017b79fa38a052c5c5b4e0229a65791681 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 11 May 2026 16:51:00 +0000 Subject: [PATCH 19/21] fixes Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 10 ---------- vllm/model_executor/layers/fused_moe/layer.py | 2 -- 2 files changed, 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 2b0492387bbf..c36668cad443 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -420,16 +420,6 @@ def _determine_placement_strategy( def _calculate_expert_maps(self) -> None: """Calculate expert mappings based on placement strategy.""" - if self.ep_size == 1: - # No EP, all experts are local - self._local_num_experts = ( - self.global_num_experts + self.num_fused_shared_experts - ) - self._expert_map = None - self._expert_mask = None - return - - # Call determine_expert_map with current config ( self._local_num_experts, self._expert_map, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3959beb584ee..a6718a054779 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -437,8 +437,6 @@ def _get_quant_method() -> FusedMoEMethodBase: else 1.0, ) - print(f"MOE layer id = {self.layer_id}") - # TODO(bnell): This method is provided as a hook so vllm/lora/layers/fused_moe.py # can safely swap out the quant_method. We should figure out a less # intrusive way to do this. From f189b627bd6cd249fc0c05a03c6609b09edf8314 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 11 May 2026 16:53:02 +0000 Subject: [PATCH 20/21] fix import path Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/expert_map_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index c36668cad443..5e41375ab0d3 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -12,7 +12,7 @@ from vllm.config.parallel import ExpertPlacementStrategy from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig -from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( +from vllm.model_executor.layers.fused_moe.experts.rocm_aiter_moe import ( init_aiter_topK_meta_data, ) From b926c6671d0ec8937b2fbf45470a6d24ad824953 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 11 May 2026 16:57:38 +0000 Subject: [PATCH 21/21] add some comments Signed-off-by: Bill Nell --- .../layers/fused_moe/expert_map_manager.py | 19 +++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 4 +++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/expert_map_manager.py b/vllm/model_executor/layers/fused_moe/expert_map_manager.py index 5e41375ab0d3..71f2186ea4dd 100644 --- a/vllm/model_executor/layers/fused_moe/expert_map_manager.py +++ b/vllm/model_executor/layers/fused_moe/expert_map_manager.py @@ -159,6 +159,25 @@ class ExpertMapManager: - Manage placement strategies (linear, round_robin) - Maintain routing tables for round-robin placement - Support dynamic reconfiguration of EP topology + + When expert_map is required: + - Expert Parallelism (EP) is enabled, i.e., when ep_size > 1 + - EP disabled (ep_size == 1): expert_map is None + * All experts are local to the current rank + * No mapping is needed + - EP enabled (ep_size > 1): expert_map is created + * Maps global expert IDs to local expert IDs + * Shape: (global_num_experts,) + * Contains the local expert index for experts on this rank, -1 for experts + on other ranks + * Used by kernels to handle distributed expert execution + - Kernel support varies: + * Supports expert_map: fused_moe, fused_marlin_moe, fused_humming_moe, + rocm_aiter_fused_moe, deep_gemm_moe, xpu_moe, gpt_oss_triton_kernels_moe + * Does not support: flashinfer_cutlass_moe, fused_batched_moe, most cutlass_moe + variants, trtllm_* kernels + * When kernel doesn't support expert_map: The modular kernel method sets + expert_map=None even if EP is enabled """ def __init__( diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 487c6e86c99d..ca18e8588798 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -250,7 +250,9 @@ def __init__( max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens - # Create ExpertMapManager to handle expert mapping and placement + # Create ExpertMapManager to handle expert mapping and placement for EP. + # See ExpertMapManager for a detailed description of what it does and when + # it is required. self.expert_map_manager = ExpertMapManager( max_num_batched_tokens=max_num_batched_tokens, top_k=top_k,