Merged

Changes from all commits (54 commits)
52b050d
implement gather before attn
ch-wan May 18, 2025
abe5c79
Merge branch 'main' into cheng/gather_before_attn
zhyncs May 18, 2025
d9a745a
wip: --enable-ep-moe drops accuracy
ch-wan May 18, 2025
bdbd802
Merge branch 'main' into cheng/fix-6297
ch-wan May 18, 2025
2f34132
Merge branch 'main' into cheng/gather_before_attn
zhyncs May 18, 2025
824254c
Merge branch 'main' into cheng/gather_before_attn
ch-wan May 19, 2025
b324ca0
Merge branch 'main' into cheng/fix-6297
ch-wan May 19, 2025
35267c3
Merge commit 'eb8f02dd87acd8689c41d15a7c0f11f5eff914d0' into cheng/ga…
ch-wan May 27, 2025
7845798
update communicator
ch-wan May 27, 2025
95df400
Merge branch 'cheng/gather_before_attn' into cheng/fix-6297
ch-wan May 27, 2025
f6fc191
Merge commit 'b18416fbf869fd2d150937d3efcf9e75ee3fb278' into cheng/ga…
ch-wan May 27, 2025
eed7ce3
fmt
ch-wan May 27, 2025
cdd6e9b
Merge branch 'cheng/gather_before_attn' into cheng/fix-6297
ch-wan May 27, 2025
81c53e0
Merge branch 'main' into cheng/gather_before_attn
zhyncs May 27, 2025
75773aa
fix
ch-wan May 27, 2025
8b138a1
fix
ch-wan May 27, 2025
fcaaeaf
fmt
ch-wan May 27, 2025
8c09414
Merge branch 'main' into cheng/gather_before_attn
ch-wan May 28, 2025
c15836b
Merge commit '8c09414e29636629f0c0544591f12933073ce5c5' into cheng/fi…
ch-wan May 28, 2025
cd6d94c
Merge commit 'b1c8d4e9f31953560f2db45a3b6e68099ef00c13' into cheng/fi…
ch-wan May 28, 2025
1870255
format
ch-wan May 28, 2025
f8d152c
fix
ch-wan May 28, 2025
e170745
Update communicator.py
ch-wan May 28, 2025
c80f333
Update communicator.py
ch-wan May 28, 2025
0c92871
fix
ch-wan May 28, 2025
a269c5b
Merge branch 'main' into cheng/gather_before_attn
ch-wan Jun 3, 2025
7a3131e
format
ch-wan Jun 3, 2025
fe98477
Merge commit '7a3131e6868ec903d03156d1765c6722b9fbcaad' into cheng/fi…
ch-wan Jun 3, 2025
260bbef
multiple fixes
ch-wan Jun 4, 2025
8cc1fea
format
ch-wan Jun 4, 2025
e78fa2c
Merge commit '4f723edd3baf3823eddfb9d6426548daba17c687' into cheng/fi…
ch-wan Jun 15, 2025
2bc64aa
Merge branch 'main' into cheng/gather_before_attn
ch-wan Jun 15, 2025
e596568
Merge branch 'main' into cheng/gather_before_attn
zhyncs Jun 15, 2025
bfdec93
Merge branch 'main' into cheng/gather_before_attn
ch-wan Jun 15, 2025
80d2f03
Merge remote-tracking branch 'origin/cheng/gather_before_attn' into c…
ch-wan Jun 15, 2025
dd99fb9
Merge commit '80d2f03ecde5992184f6128538126691ee89f04b' into cheng/fi…
ch-wan Jun 16, 2025
ca887b8
refactor
ch-wan Jun 16, 2025
ead7d9f
revert dp attention test
ch-wan Jun 16, 2025
0397727
add a hybrid test
ch-wan Jun 16, 2025
33fb878
format
ch-wan Jun 16, 2025
8c1789d
Merge commit '0ae1e9a75573b5159afb9149b8c80ae76f239ff7' into cheng/fi…
ch-wan Jun 16, 2025
e061123
Update test_hybrid_dp_ep_tp.py
ch-wan Jun 16, 2025
d887f77
update tests
ch-wan Jun 16, 2025
713f87b
update test
ch-wan Jun 18, 2025
d39549d
fix
ch-wan Jun 18, 2025
dcf58ee
Merge commit 'e56685ac1bb881e58043fe5f2c4ae055905332ba' into cheng/fi…
ch-wan Jun 18, 2025
84c31ff
update tests
ch-wan Jun 18, 2025
57857af
adapt to dp attn with mtp
ch-wan Jun 18, 2025
1e7012e
format
ch-wan Jun 18, 2025
5a1d9bd
fix mlp sync
ch-wan Jun 18, 2025
937451b
Merge commit '09ae5b20f3123487f36097d284a1f535cd267e7b' into cheng/fi…
ch-wan Jun 18, 2025
e4aab35
format
ch-wan Jun 18, 2025
a14115f
add MTP tests
ch-wan Jun 20, 2025
3c6de24
update file name and intro
ch-wan Jun 20, 2025
14 changes: 8 additions & 6 deletions python/sglang/bench_one_batch.py
@@ -71,6 +71,8 @@
configure_logger,
get_bool_env_var,
kill_process_tree,
require_mlp_sync,
require_mlp_tp_gather,
set_gpu_proc_affinity,
suppress_other_loggers,
)
@@ -243,7 +245,7 @@ def extend(reqs, model_runner):
enable_custom_logit_processor=False,
)
batch.prepare_for_extend()
_maybe_prepare_dp_attn_batch(batch, model_runner)
_maybe_prepare_mlp_sync_batch(batch, model_runner)
model_worker_batch = batch.get_model_worker_batch()
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
logits_output, _ = model_runner.forward(forward_batch)
@@ -255,26 +257,26 @@ def extend(reqs, model_runner):
def decode(input_token_ids, batch, model_runner):
batch.output_ids = input_token_ids
batch.prepare_for_decode()
_maybe_prepare_dp_attn_batch(batch, model_runner)
_maybe_prepare_mlp_sync_batch(batch, model_runner)
model_worker_batch = batch.get_model_worker_batch()
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
logits_output, _ = model_runner.forward(forward_batch)
next_token_ids = model_runner.sample(logits_output, forward_batch)
return next_token_ids, logits_output.next_token_logits


def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
if model_runner.server_args.enable_dp_attention:
Scheduler.prepare_dp_attn_batch_raw(
def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
if require_mlp_sync(model_runner.server_args):
Scheduler.prepare_mlp_sync_batch_raw(
batch,
dp_size=model_runner.server_args.dp_size,
attn_tp_size=1,
moe_dense_tp_size=model_runner.server_args.moe_dense_tp_size,
tp_cpu_group=model_runner.tp_group.cpu_group,
get_idle_batch=None,
disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
spec_algorithm=SpeculativeAlgorithm.NONE,
speculative_num_draft_tokens=None,
require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
)


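The two helpers imported above, require_mlp_sync and require_mlp_tp_gather, are defined in sglang.srt.utils and are not shown in this diff. A rough sketch of what they check, inferred only from the call sites in this PR (not the actual implementation), is:

    # Sketch inferred from call sites in this PR; the real sglang.srt.utils helpers may differ.
    def require_mlp_sync(server_args) -> bool:
        # The call sites this PR replaces were gated on DP attention or SP layernorm.
        return server_args.enable_dp_attention or server_args.enable_sp_layernorm

    def require_mlp_tp_gather(server_args) -> bool:
        # The old scheduler branch kept token counts local only when dense MLPs run
        # with TP size 1 and the DP LM head is enabled (read from
        # global_server_args_dict in the scheduler diff below); otherwise the MLP
        # input must be gathered across the TP group.
        return not (
            server_args.moe_dense_tp_size == 1 and server_args.enable_dp_lm_head
        )
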
33 changes: 14 additions & 19 deletions python/sglang/srt/disaggregation/decode.py
@@ -54,6 +54,7 @@
)
from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
from sglang.srt.utils import require_mlp_sync

logger = logging.getLogger(__name__)

@@ -645,10 +646,7 @@ def event_loop_normal_disagg_decode(self: Scheduler):
batch = self.get_next_disagg_decode_batch_to_run()
self.cur_batch = batch

prepare_dp_attn_flag = (
self.server_args.enable_dp_attention
or self.server_args.enable_sp_layernorm
)
prepare_mlp_sync_flag = require_mlp_sync(self.server_args)

if batch:
# Generate fake extend output.
@@ -657,14 +655,14 @@ def event_loop_normal_disagg_decode(self: Scheduler):
self.stream_output(
batch.reqs, any(req.return_logprob for req in batch.reqs)
)
if prepare_dp_attn_flag:
if prepare_mlp_sync_flag:
self._prepare_idle_batch_and_run(None)
else:
if prepare_dp_attn_flag:
self.prepare_dp_attn_batch(batch)
if prepare_mlp_sync_flag:
self.prepare_mlp_sync_batch(batch)
result = self.run_batch(batch)
self.process_batch_result(batch, result)
elif prepare_dp_attn_flag:
elif prepare_mlp_sync_flag:
batch, _ = self._prepare_idle_batch_and_run(None)

if batch is None and (
@@ -695,10 +693,7 @@ def event_loop_overlap_disagg_decode(self: Scheduler):
self.cur_batch = batch
last_batch_in_queue = False

prepare_dp_attn_flag = (
self.server_args.enable_dp_attention
or self.server_args.enable_sp_layernorm
)
prepare_mlp_sync_flag = require_mlp_sync(self.server_args)

if batch:
# Generate fake extend output.
@@ -707,16 +702,16 @@ def event_loop_overlap_disagg_decode(self: Scheduler):
self.stream_output(
batch.reqs, any(req.return_logprob for req in batch.reqs)
)
if prepare_dp_attn_flag:
if prepare_mlp_sync_flag:
batch_, result = self._prepare_idle_batch_and_run(
None, delay_process=True
)
if batch_:
result_queue.append((batch_.copy(), result))
last_batch_in_queue = True
else:
if prepare_dp_attn_flag:
self.prepare_dp_attn_batch(batch)
if prepare_mlp_sync_flag:
self.prepare_mlp_sync_batch(batch)
result = self.run_batch(batch)
result_queue.append((batch.copy(), result))

@@ -731,7 +726,7 @@ def event_loop_overlap_disagg_decode(self: Scheduler):
self.set_next_batch_sampling_info_done(tmp_batch)
last_batch_in_queue = True

elif prepare_dp_attn_flag:
elif prepare_mlp_sync_flag:
batch, result = self._prepare_idle_batch_and_run(
None, delay_process=True
)
@@ -761,13 +756,13 @@ def event_loop_overlap_disagg_decode(self: Scheduler):
self.last_batch = batch
self.last_batch_in_queue = last_batch_in_queue

def _prepare_idle_batch_and_run(self, batch, delay_process=False):
batch, _ = self.prepare_dp_attn_batch(batch)
def _prepare_idle_batch_and_run(self: Scheduler, batch, delay_process=False):
batch, _ = self.prepare_mlp_sync_batch(batch)
result = None
if batch:
result = self.run_batch(batch)
if not delay_process:
self.process_batch_result(batch, result)
self.prepare_mlp_sync_batch(batch, result)
return batch, result

def get_next_disagg_decode_batch_to_run(
17 changes: 5 additions & 12 deletions python/sglang/srt/disaggregation/prefill.py
@@ -45,6 +45,7 @@
)
from sglang.srt.managers.schedule_batch import FINISH_LENGTH, Req, ScheduleBatch
from sglang.srt.model_executor.forward_batch_info import ForwardMode
from sglang.srt.utils import require_mlp_sync

if TYPE_CHECKING:
from torch.distributed import ProcessGroup
@@ -274,12 +275,8 @@ def event_loop_normal_disagg_prefill(self: Scheduler) -> None:
self.process_prefill_chunk()
batch = self.get_new_batch_prefill()

# Handle DP attention
if (
self.server_args.enable_dp_attention
or self.server_args.enable_sp_layernorm
):
batch, _ = self.prepare_dp_attn_batch(batch)
if require_mlp_sync(self.server_args):
batch, _ = self.prepare_mlp_sync_batch(batch)
self.cur_batch = batch

if batch:
@@ -312,12 +309,8 @@ def event_loop_overlap_disagg_prefill(self: Scheduler) -> None:
self.process_prefill_chunk()
batch = self.get_new_batch_prefill()

# Handle DP attention
if (
self.server_args.enable_dp_attention
or self.server_args.enable_sp_layernorm
):
batch, _ = self.prepare_dp_attn_batch(batch)
if require_mlp_sync(self.server_args):
batch, _ = self.prepare_mlp_sync_batch(batch)
self.cur_batch = batch
if batch:
result = self.run_batch(batch)
10 changes: 5 additions & 5 deletions python/sglang/srt/layers/communicator.py
@@ -28,9 +28,9 @@
attn_tp_reduce_scatter,
dp_gather_partial,
dp_scatter,
get_attention_dp_size,
get_attention_tp_rank,
get_attention_tp_size,
Contributor review comment (medium): Consider removing get_local_attention_dp_size since it's no longer used after the change.

get_local_attention_dp_size,
)
from sglang.srt.managers.schedule_batch import global_server_args_dict
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -229,7 +229,7 @@ class CommunicateContext:
process_group_sizes: Dict[ScatterMode, int]
attn_tp_rank: int
attn_tp_size: int
local_attn_dp_size: int
attn_dp_size: int
tp_size: int

def is_same_group_size(self, a: ScatterMode, b: ScatterMode):
@@ -239,7 +239,7 @@ def is_same_group_size(self, a: ScatterMode, b: ScatterMode):
def init_new(cls):
attn_tp_rank = get_attention_tp_rank()
attn_tp_size = get_attention_tp_size()
local_attn_dp_size = get_local_attention_dp_size()
attn_dp_size = get_attention_dp_size()
Contributor review comment (medium): Use attn_dp_size instead of local_attn_dp_size to initialize the CommunicateContext. Suggested change:

        attn_tp_rank = get_attention_tp_rank()
        attn_tp_size = get_attention_tp_size()
        attn_dp_size = get_attention_dp_size()

tp_size = get_tensor_model_parallel_world_size()
process_group_sizes = {
ScatterMode.SCATTERED: 1,
@@ -251,7 +251,7 @@ def init_new(cls):
process_group_sizes=process_group_sizes,
attn_tp_rank=attn_tp_rank,
attn_tp_size=attn_tp_size,
local_attn_dp_size=local_attn_dp_size,
attn_dp_size=attn_dp_size,
Contributor review comment (medium): Use attn_dp_size instead of local_attn_dp_size to initialize the CommunicateContext. Suggested change:

            process_group_sizes=process_group_sizes,
            attn_tp_rank=attn_tp_rank,
            attn_tp_size=attn_tp_size,
            attn_dp_size=attn_dp_size,

tp_size=tp_size,
)

@@ -385,7 +385,7 @@ def _gather_hidden_states_and_residual(
attn_tp_all_gather(
list(residual.tensor_split(context.attn_tp_size)), local_residual
)
if context.local_attn_dp_size != 1:
if context.attn_dp_size != 1:
if context.attn_tp_rank == 0:
hidden_states += residual
hidden_states, local_hidden_states = (
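For context on the attn_dp_size != 1 branch above: hidden states are first all-gathered across the attention-TP group, and the extra, global DP gather is only needed when more than one attention-DP rank exists. The following standalone illustration uses plain torch.distributed rather than sglang's helpers; the group handles are hypothetical and, unlike dp_gather_partial, it assumes every rank holds the same number of tokens.

    import torch
    import torch.distributed as dist

    def gather_hidden_states(local_hidden, attn_tp_group, dp_group, attn_dp_size):
        # 1) Reassemble the tensor that was split across the attention-TP group.
        tp_world = dist.get_world_size(group=attn_tp_group)
        parts = [torch.empty_like(local_hidden) for _ in range(tp_world)]
        dist.all_gather(parts, local_hidden, group=attn_tp_group)
        hidden = torch.cat(parts, dim=0)
        # 2) Only when several attention-DP ranks exist is a second, global DP
        #    gather performed (the `attn_dp_size != 1` branch in the diff).
        if attn_dp_size != 1:
            dp_parts = [torch.empty_like(hidden) for _ in range(attn_dp_size)]
            dist.all_gather(dp_parts, hidden, group=dp_group)
            hidden = torch.cat(dp_parts, dim=0)
        return hidden
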
3 changes: 2 additions & 1 deletion python/sglang/srt/layers/dp_attention.py
@@ -165,7 +165,8 @@ def disable_dp_size():


def get_dp_local_info(forward_batch: ForwardBatch):
dp_rank = get_local_attention_dp_rank()
# `get_dp_local_info` is only called in global DP gather and scatter. We use global DP rank here.
dp_rank = get_attention_dp_rank()

if forward_batch.dp_local_start_pos is None:
cumtokens = torch.cumsum(forward_batch.global_num_tokens_gpu, dim=0)
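The hunk above shows only the start of get_dp_local_info; the per-rank slice is then typically derived from the cumulative token counts. A small illustrative helper (hypothetical name and signature, not the exact sglang code):

    import torch

    def dp_local_slice(global_num_tokens: torch.Tensor, dp_rank: int):
        # Cumulative sum of per-DP-rank token counts gives the slice boundaries.
        cumtokens = torch.cumsum(global_num_tokens, dim=0)
        # Rank 0 starts at 0; rank r starts where rank r-1 ends.
        start = torch.zeros_like(cumtokens[0]) if dp_rank == 0 else cumtokens[dp_rank - 1]
        num_tokens = global_num_tokens[dp_rank]
        return start, num_tokens

For example, dp_local_slice(torch.tensor([3, 5, 2]), dp_rank=1) yields start 3 and num_tokens 5.
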
4 changes: 2 additions & 2 deletions python/sglang/srt/layers/logits_processor.py
@@ -30,9 +30,9 @@
attn_tp_all_gather,
dp_gather_replicate,
dp_scatter,
get_attention_dp_rank,
get_attention_dp_size,
get_attention_tp_size,
get_local_attention_dp_rank,
get_local_attention_dp_size,
Contributor review comment on lines +33 to 36 (medium): Consider removing get_local_attention_dp_rank and get_local_attention_dp_size since they are no longer used after the change.

)
from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -171,7 +171,7 @@ def compute_dp_attention_metadata(self, hidden_states: torch.Tensor):
return

cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
dp_rank = get_local_attention_dp_rank()
dp_rank = get_attention_dp_rank()
if dp_rank == 0:
dp_local_start_pos = torch.zeros_like(
self.global_num_tokens_for_logprob_gpu[0]
19 changes: 10 additions & 9 deletions python/sglang/srt/managers/scheduler.py
@@ -148,6 +148,8 @@
kill_itself_when_parent_died,
point_to_point_pyobj,
pyspy_dump_schedulers,
require_mlp_sync,
require_mlp_tp_gather,
set_gpu_proc_affinity,
set_random_seed,
suppress_other_loggers,
@@ -1434,9 +1436,8 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
else:
ret = None

# Handle DP attention
if self.server_args.enable_dp_attention or self.server_args.enable_sp_layernorm:
ret, _ = self.prepare_dp_attn_batch(ret)
if require_mlp_sync(self.server_args):
ret, _ = self.prepare_mlp_sync_batch(ret)

return ret

@@ -1746,12 +1747,11 @@ def process_batch_result(
self.return_health_check_ct -= 1
self.send_to_tokenizer.send_pyobj(HealthCheckOutput())

def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
return self.prepare_dp_attn_batch_raw(
def prepare_mlp_sync_batch(self, local_batch: ScheduleBatch):
return self.prepare_mlp_sync_batch_raw(
local_batch,
dp_size=self.server_args.dp_size,
attn_tp_size=self.attn_tp_size,
moe_dense_tp_size=self.server_args.moe_dense_tp_size,
tp_cpu_group=self.tp_cpu_group,
get_idle_batch=self.get_idle_batch,
disable_cuda_graph=self.server_args.disable_cuda_graph,
@@ -1760,14 +1760,14 @@ def prepare_dp_attn_batch(self, local_batch: ScheduleBatch):
enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
enable_deepep_moe=self.server_args.enable_deepep_moe,
deepep_mode=DeepEPMode[self.server_args.deepep_mode],
require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
)

@staticmethod
def prepare_dp_attn_batch_raw(
def prepare_mlp_sync_batch_raw(
local_batch: ScheduleBatch,
dp_size,
attn_tp_size: int,
moe_dense_tp_size: Optional[int],
tp_cpu_group,
get_idle_batch,
disable_cuda_graph: bool,
@@ -1776,6 +1776,7 @@ def prepare_dp_attn_batch_raw(
enable_two_batch_overlap: bool,
enable_deepep_moe: bool,
deepep_mode: DeepEPMode,
require_mlp_tp_gather: bool,
):
# Check if other DP workers have running batches
if local_batch is None:
@@ -1850,7 +1851,7 @@ def prepare_dp_attn_batch_raw(

if local_batch is not None:
# TODO: handle the case when moe_dense_tp_size != 1
if moe_dense_tp_size == 1 and global_server_args_dict["enable_dp_lm_head"]:
if not require_mlp_tp_gather:
local_batch.global_num_tokens = [num_tokens]
local_batch.global_num_tokens_for_logprob = [num_tokens_for_logprob]
else: