
Commit 4d7c007

feat(moe): Add MC2 communication method for MoE layers
This method replaces the previous all-gather approach for small numbers of tokens. The key changes include:

- A new `AscendFusedMoE` layer that handles token splitting, local computation, and final aggregation via all-gather.
- Logic in the model runner to dynamically select between the new MC2 method and the existing all-gather method based on the number of input tokens.
- Sharding the MoE communication mask across tensor-parallel ranks.

Signed-off-by: Yizhou Liu <[email protected]>
1 parent 67a222c commit 4d7c007
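
The commit message describes a split / local compute / all-gather flow for the MC2 path. Below is a minimal, self-contained sketch of that flow; `toy_moe`, the tensor sizes, and the single-process loop standing in for the expert-parallel ranks are illustrative assumptions, not code from this commit.

import torch


def toy_moe(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for the per-rank expert computation.
    return x * 2.0


def mc2_style_forward(hidden_states: torch.Tensor, world_size: int) -> torch.Tensor:
    num_tokens = hidden_states.shape[0]

    # Pad so the token dimension splits evenly across ranks.
    if num_tokens % world_size != 0:
        pad = world_size - (num_tokens % world_size)
        hidden_states = torch.nn.functional.pad(hidden_states, (0, 0, 0, pad))

    # Each rank keeps only its own shard; a loop plays the role of the ranks here.
    shards = torch.tensor_split(hidden_states, world_size, dim=0)
    local_outputs = [toy_moe(shard) for shard in shards]

    # In the real layer an all-gather collects the per-rank outputs; concatenation
    # plays that role here, and the result is truncated back to the original length.
    out = torch.cat(local_outputs, dim=0)
    return out[:num_tokens]


print(mc2_style_forward(torch.randn(5, 8), world_size=4).shape)  # torch.Size([5, 8])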

File tree: 4 files changed, +143 -6 lines


vllm_ascend/distributed/moe_comm_method.py

Lines changed: 8 additions & 1 deletion
@@ -3,7 +3,9 @@
 import torch
 import torch_npu
 from transformers.configuration_utils import PretrainedConfig
-from vllm.distributed.parallel_state import get_ep_group, get_tp_group
+from vllm.distributed.parallel_state import (
+    get_ep_group, get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size, get_tp_group)
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op

@@ -305,6 +307,11 @@ def _pre_process(
         self.topk_weights = topk_weights.to(torch.float32)
         self.mc2_mask = get_forward_context().mc2_mask

+        tp_size = get_tensor_model_parallel_world_size()
+        split_mc2_mask = torch.tensor_split(self.mc2_mask, tp_size, dim=0)
+        tp_rank = get_tensor_model_parallel_rank()
+        self.mc2_mask = split_mc2_mask[tp_rank]
+
         dispatch_kwargs = {
             "x": hidden_states,
             "expert_ids": self.topk_ids,

vllm_ascend/ops/common_fused_moe.py

Lines changed: 124 additions & 2 deletions
@@ -18,12 +18,16 @@
 from typing import Callable, Optional

 import torch
+import torch.distributed as dist
+from torch import nn
 from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.distributed import get_tp_group
 from vllm.forward_context import get_forward_context
-from vllm.model_executor.layers.fused_moe.layer import \
-    UnquantizedFusedMoEMethod
+from vllm.model_executor.layers.fused_moe.layer import (
+    FusedMoE, UnquantizedFusedMoEMethod)

 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.distributed.moe_comm_method import MC2CommImpl
 from vllm_ascend.ops.fused_moe import fused_experts_moge, unified_fused_experts
 from vllm_ascend.ops.layers.experts_selector import select_experts
 from vllm_ascend.utils import is_310p
@@ -109,5 +113,123 @@ def forward_oot(
     )


+class AscendFusedMoE(FusedMoE):
+
+    def __init__(
+        self,
+        num_experts,
+        top_k,
+        hidden_size,
+        intermediate_size,
+        params_dtype=None,
+        reduce_results=False,
+        renormalize=True,
+        use_grouped_topk=False,
+        num_expert_group=None,
+        topk_group=None,
+        quant_config=None,
+        tp_size=None,
+        ep_size=None,
+        dp_size=None,
+        prefix="",
+        custom_routing_function=None,
+        scoring_func="softmax",
+        e_score_correction_bias=None,
+        apply_router_weight_on_input=False,
+        activation="silu",
+        enable_eplb=False,
+        num_redundant_experts=0,
+        has_bias=False,
+    ):
+        super().__init__(
+            num_experts,
+            top_k,
+            hidden_size,
+            intermediate_size,
+            params_dtype,
+            reduce_results,
+            renormalize,
+            use_grouped_topk,
+            num_expert_group,
+            topk_group,
+            quant_config,
+            tp_size,
+            ep_size,
+            dp_size,
+            prefix,
+            custom_routing_function,
+            scoring_func,
+            e_score_correction_bias,
+            apply_router_weight_on_input,
+            activation,
+            enable_eplb,
+            num_redundant_experts,
+            has_bias,
+        )
+
+        self.tp_group = get_tp_group().device_group
+
+    def forward_impl(self, hidden_states: torch.Tensor,
+                     router_logits: torch.Tensor):
+        assert self.quant_method is not None
+
+        num_tokens, _ = hidden_states.shape
+        forward_context = get_forward_context()
+
+        moe_comm_method = forward_context.moe_comm_method
+        if type(moe_comm_method) is MC2CommImpl:
+            # NOTE: Pad tensors to make sure they can be evenly split.
+            if num_tokens % self.ep_size != 0:
+                pad_size = self.ep_size - (num_tokens % self.ep_size)
+                hidden_states = nn.functional.pad(hidden_states,
+                                                  (0, 0, 0, pad_size))
+                router_logits = nn.functional.pad(router_logits,
+                                                  (0, 0, 0, pad_size))
+
+            split_hidden_states = torch.tensor_split(hidden_states,
+                                                     self.ep_size,
+                                                     dim=0)
+            split_router_logits = torch.tensor_split(router_logits,
+                                                     self.ep_size,
+                                                     dim=0)
+            hidden_states = split_hidden_states[self.ep_rank]
+            router_logits = split_router_logits[self.ep_rank]
+
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            top_k=self.top_k,
+            renormalize=self.renormalize,
+            use_grouped_topk=self.use_grouped_topk,
+            global_num_experts=self.global_num_experts,
+            expert_map=self.expert_map,
+            topk_group=self.topk_group,
+            num_expert_group=self.num_expert_group,
+            custom_routing_function=self.custom_routing_function,
+            scoring_func=self.scoring_func,
+            e_score_correction_bias=self.e_score_correction_bias,
+            activation=self.activation,
+            apply_router_weight_on_input=self.apply_router_weight_on_input,
+            enable_eplb=self.enable_eplb,
+            expert_load_view=self.expert_load_view,
+            logical_to_physical_map=self.logical_to_physical_map,
+            logical_replica_count=self.logical_replica_count,
+        )
+
+        if type(moe_comm_method) is MC2CommImpl:
+            dist.all_gather(list(split_hidden_states), final_hidden_states,
+                            self.tp_group)
+            final_hidden_states = torch.cat(split_hidden_states, dim=0)
+            if num_tokens % self.ep_size != 0:
+                final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+            final_hidden_states = self.maybe_all_reduce_tensor_model_parallel(
+                final_hidden_states)
+
+        return final_hidden_states
+
+
 UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func
 UnquantizedFusedMoEMethod.forward_oot = forward_oot
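
The final aggregation in `forward_impl` relies on `torch.distributed.all_gather`, which fills a list of per-rank buffers with every rank's tensor; in the diff the pre-split views of the padded input serve as those buffers, and `torch.cat` then rebuilds the full batch. A minimal single-process sketch of the call shape, assuming a gloo backend, world size 1, and made-up shapes:

import os

import torch
import torch.distributed as dist

# Single-process process group, just to exercise the collective's API.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

world_size = dist.get_world_size()
local_shard = torch.randn(3, 8)  # this rank's MoE output shard
gathered = [torch.empty_like(local_shard) for _ in range(world_size)]

# all_gather writes every rank's shard into the pre-allocated list;
# concatenating along dim 0 rebuilds the full token batch.
dist.all_gather(gathered, local_shard)
full = torch.cat(gathered, dim=0)
print(full.shape)  # torch.Size([3, 8]) with world_size == 1

dist.destroy_process_group()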

vllm_ascend/utils.py

Lines changed: 3 additions & 0 deletions
@@ -493,6 +493,9 @@ def register_ascend_customop():
     from vllm_ascend.ops.layernorm import AscendRMSNorm
     CustomOp.register_oot(_decorated_op_cls=AscendRMSNorm, name="RMSNorm")

+    from vllm_ascend.ops.common_fused_moe import AscendFusedMoE
+    CustomOp.register_oot(_decorated_op_cls=AscendFusedMoE, name="FusedMoE")
+
     # NOTE: Keep this at last to ensure all custom actions are registered
     _ASCEND_CUSTOMOP_IS_REIGISTERED = True
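
The registration above adds the out-of-tree `AscendFusedMoE` under the op name "FusedMoE", alongside the existing `AscendRMSNorm` registration. The snippet below is a conceptual toy registry illustrating the name-based substitution pattern, not vLLM's `CustomOp` machinery; every name in it is a stand-in.

from typing import Dict, Type

_OOT_REGISTRY: Dict[str, Type] = {}


def register_oot(name: str, cls: Type) -> None:
    # Record a platform-specific override under the upstream op name.
    _OOT_REGISTRY[name] = cls


def resolve(name: str, default: Type) -> Type:
    # Prefer the registered override, fall back to the upstream class.
    return _OOT_REGISTRY.get(name, default)


class FusedMoE:  # upstream layer (stand-in)
    pass


class AscendFusedMoE(FusedMoE):  # NPU-specific override (stand-in)
    pass


register_oot("FusedMoE", AscendFusedMoE)
print(resolve("FusedMoE", FusedMoE).__name__)  # AscendFusedMoE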

vllm_ascend/worker/model_runner_v1.py

Lines changed: 8 additions & 3 deletions
@@ -87,6 +87,7 @@
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper
 from vllm_ascend.distributed.moe_comm_method import (AllGatherCommImpl,
                                                      DummyCommImpl,
+                                                     MC2CommImpl,
                                                      MoECommMethod)
 from vllm_ascend.multistream.ms_split import compute_split_seq_index
 from vllm_ascend.platform import NPUPlatform

@@ -360,13 +361,14 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
             self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer
             self.is_kv_consumer = vllm_config.kv_transfer_config.is_kv_consumer

+        self.mc2_tokens_capacity = 512 * self.parallel_config.tensor_parallel_size
         self.reserved_mc2_mask = torch.zeros(
-            512,
+            self.mc2_tokens_capacity,
             dtype=torch.bool,
             device=self.device,
         )

-        self.moe_comm_method = AllGatherCommImpl
+        self.moe_comm_method = MC2CommImpl

     def _use_aclgraph(self) -> bool:
         return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager

@@ -1557,6 +1559,9 @@ def execute_model(
             intermediate_tensors) = (self._prepare_inputs(
                 scheduler_output, intermediate_tensors))

+        moe_comm_method = (self.moe_comm_method if num_input_tokens
+                           <= self.mc2_tokens_capacity else AllGatherCommImpl)
+
         # Run forward pass
         with ProfileExecuteDuration().capture_async("forward"):
             with set_ascend_forward_context(

@@ -1566,7 +1571,7 @@
                     num_tokens_across_dp=num_tokens_across_dp,
                     with_prefill=self.with_prefill,
                     reserved_mc2_mask=self.reserved_mc2_mask,
-                    moe_comm_method=self.moe_comm_method(
+                    moe_comm_method=moe_comm_method(
                         self.device, self.dtype, self.model_config.hf_config),
                     num_actual_tokens=scheduler_output.
                     total_num_scheduled_tokens):
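
The runner sizes `mc2_tokens_capacity` at 512 tokens per tensor-parallel rank and drops back to the all-gather implementation whenever a batch exceeds that capacity. A minimal sketch of the selection rule, with stub classes standing in for the real comm implementations:

class AllGatherCommImpl: ...


class MC2CommImpl: ...


def select_moe_comm_method(num_input_tokens: int, tensor_parallel_size: int) -> type:
    mc2_tokens_capacity = 512 * tensor_parallel_size
    # MC2 handles small batches; larger batches fall back to all-gather.
    return (MC2CommImpl if num_input_tokens <= mc2_tokens_capacity
            else AllGatherCommImpl)


print(select_moe_comm_method(256, tensor_parallel_size=2).__name__)   # MC2CommImpl
print(select_moe_comm_method(4096, tensor_parallel_size=2).__name__)  # AllGatherCommImpl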
