@@ -32,7 +32,7 @@ class AlltoallMethodType(IntEnum):
3232 NotEnabled = 0
3333 # MNNVL
3434 MNNVL = 1
35- # DeepEP intranode or internode: no CUDA Graphs support , IBGDA is required by internode
35+ # DeepEP intranode or internode: CUDA Graphs are supported, IBGDA is required by internode
3636 DeepEP = 2
3737 # DeepEP low latency: CUDA Graphs are supported, IBGDA is required
3838 DeepEPLowLatency = 3
@@ -101,6 +101,8 @@ def __init__(
101101 self .repeat_idx = 0
102102 self .repeat_count = 1
103103
104+ self .use_cuda_graph = model_config .use_cuda_graph
105+
104106 moe_load_balancer_config = model_config .moe_load_balancer
105107 init_expert_size_per_partition = moe_load_balancer_config .num_local_slots if moe_load_balancer_config else self .num_experts // self .ep_size
106108 self .initial_global_assignments = [
@@ -212,6 +214,9 @@ def __init__(
212214 str (
213215 min (model_config .max_num_tokens ,
214216 self .moe_max_num_tokens ))))
217+ # Set nvshmem queue pair depth larger than the number of in-flight WRs (ref: https://github.com/deepseek-ai/DeepEP/issues/427)
218+ os .environ ['NVSHMEM_QP_DEPTH' ] = str (
219+ 2 * (self .deep_ep_max_num_tokens + 1 ))
215220 self .deep_ep_buffer = buffer_pool .get_low_latency_buffer (
216221 model_config .mapping )
217222 self .deep_ep_buffer .reserve (self .deep_ep_max_num_tokens ,
@@ -255,6 +260,25 @@ def _check_configs(self):
255260 def select_alltoall_method_type (mapping : Mapping , top_k : int ,
256261 dtype : torch .dtype ,
257262 use_cuda_graph : bool ) -> AlltoallMethodType :
263+
def is_deepep_feasible(num_ranks: int) -> bool:
    """Return True when the EP rank layout matches a DeepEP-supported topology.

    DeepEP handles exactly two layouts:
      1. Intranode: a single node holding 2, 4, or 8 ranks in total.
      2. Internode: 8 ranks per node spread across 2, 4, 8, or 16 nodes.
    """
    intranode_rank_counts = {2, 4, 8}
    internode_node_counts = {2, 4, 8, 16}
    ranks_per_node = 8

    local_size = local_mpi_size()
    # Single-node layout: every EP rank lives on this node.
    if num_ranks == local_size and num_ranks in intranode_rank_counts:
        return True
    # Multi-node layout requires exactly 8 ranks on each node.
    if local_size != ranks_per_node:
        return False
    # Number of RDMA-connected nodes participating in the exchange.
    return (num_ranks // local_size) in internode_node_counts
281+
258282 all2all_method_type = os .environ .get ("TRTLLM_FORCE_ALLTOALL_METHOD" )
259283 if all2all_method_type is not None :
260284 return AlltoallMethodType [all2all_method_type ]
@@ -276,12 +300,10 @@ def select_alltoall_method_type(mapping: Mapping, top_k: int,
276300
277301 if os .environ .get ("TRTLLM_CAN_USE_DEEP_EP" , "0" ) == "1" :
278302 if deep_ep_installed and dtype == torch .bfloat16 :
279- if use_cuda_graph :
280- # Here we can only choose DeepEPLowLatency since only this method supports CUDA Graphs.
281- return AlltoallMethodType .DeepEPLowLatency
282- else :
283- # Here we can choose DeepEP or DeepEPLowLatency if both are available. Now DeepEP is faster.
303+ # Choose DeepEP if feasible
304+ if is_deepep_feasible (mapping .moe_ep_size ):
284305 return AlltoallMethodType .DeepEP
306+ return AlltoallMethodType .DeepEPLowLatency
285307
286308 return AlltoallMethodType .NotEnabled
287309
@@ -548,7 +570,7 @@ def forward_chunk(
548570 if not use_postquant_alltoall :
549571 x , recv_topk_idx , token_final_scales , num_recv_tokens_per_expert_list , deep_ep_handle = \
550572 self .deep_ep_buffer .dispatch (x , token_selected_slots , token_final_scales , self .num_slots ,
551- self .expert_size_per_partition * self .mapping .moe_ep_rank )
573+ self .expert_size_per_partition * self .mapping .moe_ep_rank , all_rank_max_num_tokens , self . ep_size , self . use_cuda_graph )
552574 padded , x , _ , token_selected_slots , token_final_scales = self .pad_empty_recv_tensors (
553575 x , None , recv_topk_idx , token_final_scales )
554576 elif self .alltoall_method_type == AlltoallMethodType .DeepEPLowLatency :
@@ -636,7 +658,7 @@ def forward_chunk(
636658 x_sf = x_sf .view (torch .float32 )
637659 (x , x_sf ), recv_topk_idx , token_final_scales , num_recv_tokens_per_expert_list , deep_ep_handle = \
638660 self .deep_ep_buffer .dispatch ((x , x_sf ), token_selected_slots , token_final_scales , self .num_slots ,
639- self .expert_size_per_partition * self .mapping .moe_ep_rank )
661+ self .expert_size_per_partition * self .mapping .moe_ep_rank , all_rank_max_num_tokens , self . ep_size , self . use_cuda_graph )
640662 padded , x , x_sf , token_selected_slots , token_final_scales = self .pad_empty_recv_tensors (
641663 x , x_sf , recv_topk_idx , token_final_scales )
642664 if x_sf is not None :
0 commit comments