File tree: 1 file changed, +7 -5 lines changed
examples/wide_ep/slurm_scripts: 1 file changed, +7 -5 lines changed
Columns: original file line number | diff line number | diff line change
@@ -11,7 +11,7 @@ workdir=<workdir>  # Path to disaggr_torch.slurm
11 11   model_dir=<model_dir>  # Path to the model checkpoint
12 12
13 13   mtp_size=0
14    - ntasks_per_node=4 # 4 GPUs per GB200 node
   14 + ntasks_per_node=4 # 4 GPUs per GB200 node, 8 GPUs per B200 node
15 15
16 16   isl=1024
17 17   osl=1024
@@ -22,8 +22,9 @@ streaming=true
22 22   for b in 1 64 1024; do
23 23       for eplb_num_slots in 0 256 288; do
24 24           concurrency=$((b * 16))
25    -         ctx_num=$(((concurrency + 5499)/5500))
26    -         total_node_num=$((ctx_num + 4))
   25 +         ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
   26 +         ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
   27 +         total_node_num=$((ctx_node_num + 4)) # $((ctx_node_num + 2)) for B200
27 28           ntasks=$((total_node_num * ntasks_per_node))
28 29
29 30           args=(
@@ -56,8 +57,9 @@
56 57   # dep32 eplb288
57 58   for b in 512; do
58 59       concurrency=$((b * 32))
59    -     ctx_num=$(((concurrency + 5499)/5500))
60    -     total_node_num=$((ctx_num + 8))
   60 +     ctx_node_num=$(((concurrency + 5499)/5500)) # $(((concurrency + 10999)/11000)) for B200
   61 +     ctx_num=${ctx_node_num} # $((ctx_node_num * 2)) for B200
   62 +     total_node_num=$((ctx_node_num + 8)) # $((ctx_node_num + 4)) for B200
61 63       ntasks=$((total_node_num * ntasks_per_node))
62 64       eplb_num_slots=288
63 65
 
 
   
 
     
   
   
          
    
    
     
    
      
     
     
    You can’t perform that action at this time.
  
 
    
  
    
      
        
     
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments