Commit 02bf032

fix refactor issues

Signed-off-by: Dongxu Yang <[email protected]>
1 parent f007144 commit 02bf032

2 files changed (+30, -78 lines)

tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py

Lines changed: 0 additions & 52 deletions
@@ -179,58 +179,6 @@ def _(
     return (input.new_empty(output_shape, dtype=torch.uint8),
             global_scale.new_empty(scale_shape, dtype=torch.uint8))
 
-@torch.library.register_fake("trtllm::moe_comm_prepare_indices")
-def _(
-    gathered_target_rank_ids: torch.Tensor,
-    real_rank_token_count_cum_sum: Optional[torch.Tensor],
-    max_token_count_per_rank: int,
-    expert_count: int,
-    top_k: int,
-    ep_rank: int,
-    ep_size: int,
-):
-    max_send_ranks_per_token = max(ep_size, top_k)
-    local_gather_indices_shape = (max_token_count_per_rank * ep_size, )
-    rank_count_cum_sum_shape = (ep_size, )
-    send_rank_local_indices_shape = (max_token_count_per_rank *
-                                     max_send_ranks_per_token, )
-    recv_rank_local_indices_shape = (max_token_count_per_rank * ep_size, )
-    backward_recv_rank_local_indices_shape = (max_token_count_per_rank *
-                                              max_send_ranks_per_token, )
-
-    local_gather_indices = gathered_target_rank_ids.new_empty(
-        local_gather_indices_shape, dtype=torch.int32)
-    send_rank_count_cum_sum = gathered_target_rank_ids.new_empty(
-        rank_count_cum_sum_shape, dtype=torch.int32)
-    send_rank_local_indices = gathered_target_rank_ids.new_empty(
-        send_rank_local_indices_shape, dtype=torch.int32)
-    recv_rank_count_cum_sum = gathered_target_rank_ids.new_empty(
-        rank_count_cum_sum_shape, dtype=torch.int32)
-    recv_rank_local_indices = gathered_target_rank_ids.new_empty(
-        recv_rank_local_indices_shape, dtype=torch.int32)
-    backward_recv_rank_local_indices = gathered_target_rank_ids.new_empty(
-        backward_recv_rank_local_indices_shape, dtype=torch.int32)
-
-    return (local_gather_indices, send_rank_count_cum_sum,
-            send_rank_local_indices, recv_rank_count_cum_sum,
-            recv_rank_local_indices, backward_recv_rank_local_indices)
-
-@torch.library.register_fake("trtllm::moe_local_gather")
-def _(
-    recv_rank_cum_sum: torch.Tensor,
-    local_gather_indices: torch.Tensor,
-    gathered_expert_ids: torch.Tensor,
-    gathered_scales: Optional[torch.Tensor],
-    local_expert_ids: torch.Tensor,
-    local_scales: Optional[torch.Tensor],
-    max_token_count_per_rank: int,
-    expert_count: int,
-    top_k: int,
-    ep_rank: int,
-    ep_size: int,
-):
-    pass
-
 @torch.library.register_fake("trtllm::moe_comm")
 def _(
     input: torch.Tensor,
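
Note on what is being deleted here: torch.library.register_fake attaches a shape-and-dtype-only "fake" (meta) implementation to a custom op, so tracing tools such as torch.compile and torch.export can propagate tensor metadata without running the real kernel. A minimal, self-contained sketch of the same pattern; the op name "mylib::gather_indices" is hypothetical, since the real trtllm:: ops are defined and registered in C++:

import torch

# Hypothetical op for illustration only; the trtllm ops deleted above are
# defined in C++ under the "trtllm::" namespace.
torch.library.define("mylib::gather_indices",
                     "(Tensor target_rank_ids, int ep_size) -> Tensor")

@torch.library.register_fake("mylib::gather_indices")
def _(target_rank_ids: torch.Tensor, ep_size: int) -> torch.Tensor:
    # The fake kernel computes output metadata only: an empty tensor with the
    # expected shape and dtype, mirroring the new_empty calls removed above.
    return target_rank_ids.new_empty((target_rank_ids.shape[0] * ep_size, ),
                                     dtype=torch.int32)

With only the fake registered, the op can be traced with FakeTensors, even though invoking it on real data would still require a backend kernel.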

tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py

Lines changed: 30 additions & 26 deletions
@@ -192,7 +192,7 @@ def __init__(
         self.use_low_precision_combine = (os.environ.get(
             "TRTLLM_MOE_USE_LOW_PRECISION_COMBINE", "0")
                                           == "1") and qm.has_nvfp4()
-
+
         if self.alltoall_method_type == AlltoallMethodType.MNNVL:
             MnnvlMemory.initialize()
             self.alltoall_workspace = MnnvlMoe.get_moe_workspaces(
@@ -296,6 +296,9 @@ def calculate_num_chunks(self, all_rank_num_tokens: List[int]) -> int:
                 1) // self.moe_max_num_tokens
 
     def can_use_alltoall(self, all_rank_num_tokens, all_rank_max_num_tokens):
+        if self.alltoall_method_type == AlltoallMethodType.MNNVL:
+            return True
+
         # Disable alltoall when chunking is used
         if self.calculate_num_chunks(all_rank_num_tokens) > 1:
            return False
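
Condensed into a free-standing sketch, the new guard short-circuits for MNNVL before the chunking check ever runs. The enum members and the trailing return True (standing in for the remaining per-backend checks) are assumptions for illustration:

from enum import Enum

class AlltoallMethodType(Enum):  # members assumed for this sketch
    NotEnabled = 0
    MNNVL = 1
    DeepEP = 2

def can_use_alltoall(method_type: AlltoallMethodType, num_chunks: int) -> bool:
    # MNNVL now always takes the alltoall path, even when chunking is active.
    if method_type == AlltoallMethodType.MNNVL:
        return True
    # Other backends still disable alltoall when chunking is used.
    if num_chunks > 1:
        return False
    return True  # placeholder for the remaining checks in the real method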
@@ -453,12 +456,12 @@ def forward_chunk(
         else:
             tuner_num_tokens = None
             tuner_top_k = None
+        alltoall_info = None
         if use_all_to_all:
             if self.alltoall_method_type == AlltoallMethodType.MNNVL:
                 if self.enable_dummy_allreduce:
                     self.dummy_allreduce()
                 token_count = x.shape[0]
-                alltoall_info = None
                 if is_last_call and self.layer_load_balancer is not None and not self.layer_load_balancer.is_static_routing(
                 ):
                     loadbalancer_local_statistic_info = self.layer_load_balancer.get_local_statistic_tensor(
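
Hoisting alltoall_info = None out of the MNNVL-only branch is a correctness fix, not just formatting: a name bound only inside one branch raises UnboundLocalError when a later line reads it on any other path (such as the alltoall_dispatch call further down, which takes alltoall_info as an argument). A minimal reproduction of the hazard:

def before(use_mnnvl: bool):
    if use_mnnvl:
        alltoall_info = None  # bound only on the MNNVL branch
    return alltoall_info      # UnboundLocalError when use_mnnvl is False

def after(use_mnnvl: bool):
    alltoall_info = None      # always bound before branching
    if use_mnnvl:
        pass                  # the MNNVL path would populate it here
    return alltoall_info      # safe on every path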
@@ -469,7 +472,7 @@
                     self.alltoall_prepare(all_rank_max_num_tokens,
                                           token_selected_slots,
                                           loadbalancer_local_statistic_info)
-
+
                 if gathered_loadbalancer_local_statistic_info is not None:
                     gathered_loadbalancer_local_statistic_info = gathered_loadbalancer_local_statistic_info.view(
                         (self.mapping.moe_ep_size, self.num_experts))
@@ -577,10 +580,13 @@
             if self.alltoall_method_type == AlltoallMethodType.MNNVL:
                 top_k = self.routing_method.experts_per_token
                 x, x_sf, token_selected_slots, token_final_scales = self.alltoall_dispatch(
-                    x, x_sf, token_selected_slots, token_final_scales, all_rank_max_num_tokens, top_k, alltoall_info)
+                    x, x_sf, token_selected_slots, token_final_scales,
+                    all_rank_max_num_tokens, top_k, alltoall_info)
 
         if use_postquant_alltoall:
-            if self.alltoall_method_type == AlltoallMethodType.DeepEP:
+            if self.alltoall_method_type == AlltoallMethodType.MNNVL:
+                pass
+            elif self.alltoall_method_type == AlltoallMethodType.DeepEP:
                 if x_sf is not None:
                     # Adapter between `x_sf` and DeepEP
                     # TODO: remove the adapter by adding dtype support to DeepEP
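
The new explicit pass branch makes MNNVL's exclusion from the post-quant alltoall block deliberate rather than letting it silently fail the DeepEP check; presumably the MNNVL dispatch above has already moved the payload. A condensed sketch of the branch style, with hypothetical names standing in for the real method:

from enum import Enum

class MethodType(Enum):  # stand-in for AlltoallMethodType
    MNNVL = 1
    DeepEP = 2

def postquant_comm(method_type: MethodType, x, x_sf):
    if method_type == MethodType.MNNVL:
        pass  # explicit no-op: MNNVL already dispatched before this block
    elif method_type == MethodType.DeepEP:
        x, x_sf = adapt_for_deepep(x, x_sf)  # hypothetical adapter helper
    return x, x_sf

def adapt_for_deepep(x, x_sf):
    # Stub standing in for the real x_sf/DeepEP dtype adapter noted above.
    return x, x_sf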
@@ -858,34 +864,32 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
         self.repeat_idx = 0 if self.repeat_idx == self.repeat_count - 1 else self.repeat_idx + 1
         return outputs
 
-    def alltoall_prepare(
-            self, all_rank_max_num_tokens: int,
-            token_selected_slots: torch.Tensor,
-            local_statistic_tensor: Optional[torch.Tensor]):
+    def alltoall_prepare(self, all_rank_max_num_tokens: int,
+                         token_selected_slots: torch.Tensor,
+                         local_statistic_tensor: Optional[torch.Tensor]):
         top_k = self.routing_method.experts_per_token
 
         alltoall_info, gathered_local_statistic_tensor = MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
-            token_selected_slots,
-            local_statistic_tensor, self.alltoall_prepare_workspace,
-            all_rank_max_num_tokens, self.ep_rank, self.ep_size,
-            self.num_experts, self.num_slots, top_k)
+            token_selected_slots, local_statistic_tensor,
+            self.alltoall_prepare_workspace, all_rank_max_num_tokens,
+            self.ep_rank, self.ep_size, self.num_experts, self.num_slots, top_k)
 
         return token_selected_slots, gathered_local_statistic_tensor, alltoall_info
 
     def alltoall_dispatch(self, x: torch.Tensor, x_sf: Optional[torch.Tensor],
-                          token_selected_slots: torch.Tensor,
-                          token_final_scales: Optional[torch.Tensor],
-                          all_rank_max_num_tokens: int,
-                          top_k: int,
-                          alltoall_info: MoEAlltoallInfo):
-
-        x, x_sf, token_selected_slots, token_final_scales = MnnvlMoe.mnnvl_moe_alltoallv([x, x_sf, token_selected_slots, token_final_scales], alltoall_info,
-                                                                                         self.alltoall_workspace, self.ep_rank,
-                                                                                         self.ep_size)
-
-        torch.ops.trtllm.memset_expert_ids(
-            token_selected_slots, alltoall_info.recv_rank_count_cumsum,
-            all_rank_max_num_tokens, top_k, self.num_slots, self.ep_size)
+                          token_selected_slots: torch.Tensor,
+                          token_final_scales: Optional[torch.Tensor],
+                          all_rank_max_num_tokens: int, top_k: int,
+                          alltoall_info: MoEAlltoallInfo):
+
+        x, x_sf, token_selected_slots, token_final_scales = MnnvlMoe.mnnvl_moe_alltoallv(
+            [x, x_sf, token_selected_slots, token_final_scales], alltoall_info,
+            self.alltoall_workspace, self.ep_rank, self.ep_size)
+
+        torch.ops.trtllm.memset_expert_ids(token_selected_slots,
+                                           alltoall_info.recv_rank_count_cumsum,
+                                           all_rank_max_num_tokens, top_k,
+                                           self.num_slots, self.ep_size)
 
         return x, x_sf, token_selected_slots, token_final_scales
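
Read together, the two reformatted helpers implement a two-phase exchange: alltoall_prepare derives routing metadata (alltoall_info) from token_selected_slots via MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather, then alltoall_dispatch moves x, x_sf, slots, and scales with MnnvlMoe.mnnvl_moe_alltoallv and post-processes the received slot ids with torch.ops.trtllm.memset_expert_ids. A runnable mock of that prepare-then-dispatch shape; every function here is an illustrative stand-in, not the real trtllm API:

import torch
from typing import Tuple

def prepare(token_selected_slots: torch.Tensor, num_slots: int) -> dict:
    # Phase 1: derive communication metadata from routing decisions only.
    counts = torch.bincount(token_selected_slots.flatten(), minlength=num_slots)
    return {"recv_counts": counts}

def dispatch(x: torch.Tensor, slots: torch.Tensor,
             info: dict) -> Tuple[torch.Tensor, torch.Tensor]:
    # Phase 2: move payloads according to the phase-1 metadata.
    # (The real implementation performs a cross-rank alltoallv here.)
    return x, slots

x = torch.randn(8, 16)               # 8 tokens, hidden size 16
slots = torch.randint(0, 4, (8, 2))  # top-2 routing over 4 slots
info = prepare(slots, num_slots=4)
x, slots = dispatch(x, slots, info)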