Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/advanced_features/server_arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
| `--enable-return-hidden-states` | Enable returning hidden states with responses. | `False` | bool flag (set to enable) |
| `--scheduler-recv-interval` | The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this. | `1` | Type: int |
| `--numa-node` | Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess. | `None` | List[int] |
| `--enable-attn-tp-input-scattered` | Allow the attention input to remain scattered when using only tensor parallelism, reducing the computational load of operations such as the QKV latent projection. | `False` | bool flag (set to enable) |
## Debug tensor dumps
| Argument | Description | Defaults | Options |
Expand Down
199 changes: 190 additions & 9 deletions python/sglang/srt/layers/communicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import logging
from contextlib import contextmanager
from dataclasses import dataclass
from enum import Enum, auto
from functools import partial
from typing import Dict, List, Optional
from typing import Callable, Dict, List, Optional, Tuple

import torch

from sglang.srt.distributed import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
get_tp_group,
tensor_model_parallel_all_reduce,
Expand Down Expand Up @@ -59,9 +61,10 @@
prepare_weight_cache,
)

_is_cuda = is_cuda()
_is_flashinfer_available = is_flashinfer_available()
_is_sm90_supported = is_cuda() and is_sm90_supported()
_is_sm100_supported = is_cuda() and is_sm100_supported()
_is_sm90_supported = _is_cuda and is_sm90_supported()
_is_sm100_supported = _is_cuda and is_sm100_supported()
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip()
_is_gfx95_supported = is_gfx95_supported()

Expand Down Expand Up @@ -92,6 +95,119 @@ def model_input_output():
return ScatterMode.TP_ATTN_FULL


class AttentionInputs:
    """Lazily materialized attention inputs for the attn-TP scattered path.

    Holds the rank-local hidden states and, on first use, computes and caches
    the qkv latent and/or the full hidden states. When input scattering is
    active, the cached results are all-gathered back to the full (padded)
    token count before being returned.
    """

    def __init__(
        self,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
        qkv_latent_func: Callable,
    ):
        # Rank-local (possibly scattered) hidden states.
        self.hidden_states_local = hidden_states
        self.forward_batch = forward_batch
        # Callable producing the qkv latent from the local hidden states.
        self.qkv_latent_func = qkv_latent_func
        # Lazily-computed caches.
        self.hidden_states_ = None
        self.qkv_latent_ = None

    def tp_all_gather_hidden_states(self, hidden_states, forward_batch):
        # Gather rank-local shards back to the full token count implied by
        # the (padded) input_ids of the batch.
        total_tokens = forward_batch.input_ids.shape[0]
        gathered = hidden_states.new_empty((total_tokens, hidden_states.shape[-1]))
        get_tp_group().all_gather_into_tensor(gathered, hidden_states)
        return gathered

    def fetch_qkv_latent(self):
        """Return the qkv latent, computing and caching it on first call."""
        if self.qkv_latent_ is None:
            assert self.qkv_latent_func is not None
            latent = self.qkv_latent_func(self.hidden_states_local, self.forward_batch)
            if get_attn_tp_context().input_scattered:
                latent = self.tp_all_gather_hidden_states(latent, self.forward_batch)
            self.qkv_latent_ = latent
        return self.qkv_latent_

    def fetch_hidden_states(self):
        """Return the full hidden states, gathering and caching on first call."""
        if self.hidden_states_ is None:
            states = self.hidden_states_local
            if get_attn_tp_context().input_scattered:
                states = self.tp_all_gather_hidden_states(states, self.forward_batch)
            self.hidden_states_ = states
        return self.hidden_states_


class AttnTpContext:
    """Global state for the attn-TP input-scattered optimization.

    When ``--enable-attn-tp-input-scattered`` is set and the model/config
    conditions in :meth:`init_context` hold, attention inputs may remain
    scattered across plain-TP ranks during extend batches, reducing the cost
    of per-token work such as the qkv latent projection.
    """

    def __init__(self):
        # Static eligibility, resolved once by init_context().
        self.allow_input_scattered = False
        # Per-forward flag, toggled inside maybe_input_scattered().
        self.input_scattered_ = False
        # Lazily-fetched attention inputs for the current forward pass.
        # (String annotation: AttentionInputs is a forward reference here.)
        self.attn_inputs_: Optional["AttentionInputs"] = None

    def init_context(self, q_lora_rank, is_nsa):
        """Resolve whether input scattering is allowed for this model/config.

        Only plain TP (>1 ranks) with an MLA-style q_lora_rank, without NSA,
        DP attention, MoE A2A, piecewise CUDA graph, or EAGLE3 speculation.
        """
        self.allow_input_scattered = (
            get_global_server_args().enable_attn_tp_input_scattered
            and _is_cuda
            and q_lora_rank is not None
            and not is_nsa
            and get_tensor_model_parallel_world_size() > 1
            and not is_dp_attention_enabled()
            and get_moe_a2a_backend().is_none()
            and not enable_moe_dense_fully_dp()
            and not get_global_server_args().enable_piecewise_cuda_graph
            and get_global_server_args().speculative_algorithm != "EAGLE3"
        )
        if get_global_server_args().enable_attn_tp_input_scattered:
            if not self.allow_input_scattered:
                logging.info(
                    "attn_tp_input_scattered was requested but is disabled "
                    "because other conditions are not met"
                )
            else:
                logging.info("attn_tp_input_scattered is enabled")

    def use_input_scattered(self, forward_batch: "ForwardBatch"):
        """Whether this forward batch should run with scattered attn input."""
        return (
            self.allow_input_scattered
            and forward_batch.forward_mode.is_extend()
            and not forward_batch.forward_mode.is_target_verify()
            and not forward_batch.forward_mode.is_draft_extend()
            and forward_batch.input_ids is not None
            and not forward_batch.can_run_tbo
        )

    @property
    def input_scattered(self):
        # True only inside a maybe_input_scattered() scope that enabled it.
        return self.input_scattered_

    def set_attn_inputs(self, attn_inputs: "AttentionInputs"):
        self.attn_inputs_ = attn_inputs

    def fetch_qkv_latent(self):
        assert self.attn_inputs_ is not None
        return self.attn_inputs_.fetch_qkv_latent()

    def fetch_hidden_states(self):
        assert self.attn_inputs_ is not None
        return self.attn_inputs_.fetch_hidden_states()

    @contextmanager
    def maybe_input_scattered(self, forward_batch: "ForwardBatch"):
        """Scope in which input_scattered reflects this batch's eligibility.

        Uses try/finally so the previous flag is restored and the cached
        attention inputs are dropped even if the forward pass raises; the
        original code skipped cleanup on exceptions, leaving the flag stuck.
        """
        old_flag = self.input_scattered_
        self.input_scattered_ = self.use_input_scattered(forward_batch)
        try:
            yield
        finally:
            self.input_scattered_ = old_flag
            self.attn_inputs_ = None

# Process-wide singleton tracking the attn-TP input-scattered state.
ATTN_TP_CONTEXT = AttnTpContext()


def get_attn_tp_context():
    """Return the global AttnTpContext singleton."""
    return ATTN_TP_CONTEXT


@dataclass
class _LayerModeComputationContext:
num_layers: int
Expand Down Expand Up @@ -188,12 +304,14 @@ def __init__(
# Reduce scatter requires skipping all-reduce in model code after MoE/MLP, so only enable for models which have that implemented. Remove flag once done for all models that use LayerCommunicator.
allow_reduce_scatter: bool = False,
is_last_layer: bool = False,
qkv_latent_func: Optional[Callable] = None,
):
self.layer_scatter_modes = layer_scatter_modes
self.input_layernorm = input_layernorm
self.post_attention_layernorm = post_attention_layernorm
self.allow_reduce_scatter = allow_reduce_scatter
self.is_last_layer = is_last_layer
self.qkv_latent_func = qkv_latent_func

self._context = CommunicateContext.init_new()
self._communicate_simple_fn = CommunicateSimpleFn.get_fn(
Expand Down Expand Up @@ -252,6 +370,11 @@ def prepare_attn(
forward_batch: ForwardBatch,
quant_format: str = "",
):
if get_attn_tp_context().input_scattered:
hidden_states, residual = self._tp_reduce_scatter(
hidden_states,
residual,
)
if hidden_states.shape[0] == 0:
residual = hidden_states
else:
Expand Down Expand Up @@ -335,9 +458,32 @@ def prepare_attn(
forward_batch=forward_batch,
context=self._context,
)

if self.qkv_latent_func is not None:
attn_inputs = AttentionInputs(
hidden_states, forward_batch, self.qkv_latent_func
)
get_attn_tp_context().set_attn_inputs(attn_inputs)
return hidden_states, residual

def _tp_reduce_scatter(
    self,
    hidden_states: torch.Tensor,
    residual: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Reduce-scatter hidden_states across TP ranks; slice residual to match.

    Requires the token count to be a multiple of tp_size (padding is done
    upstream — see prepare_attn_tp_scatter_input).
    """
    total_tokens = hidden_states.shape[0]
    if total_tokens == 0:
        # Empty batch: nothing to scatter; reuse the tensor as residual too.
        return hidden_states, hidden_states
    tp_size = self._context.tp_size
    assert (
        total_tokens % tp_size == 0
    ), f"Expected total tokens {total_tokens} % tp_size {tp_size} to be 0"
    scattered = hidden_states.new_empty(
        total_tokens // tp_size, *hidden_states.shape[1:]
    )
    get_tp_group().reduce_scatter_tensor(scattered, hidden_states)
    if residual is not None:
        # Keep only this rank's slice of the residual.
        residual = residual.tensor_split(tp_size)[self._context.tp_rank]
    return scattered, residual

Comment on lines +468 to +486
Copy link
Copy Markdown
Contributor

@coderabbitai coderabbitai bot Oct 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical: reduce_scatter with tensor_split breaks for L % tp_size != 0

NCCL reduce_scatter requires equal chunk sizes; tensor_split yields uneven chunks when total_tokens is not divisible by tp_size, causing runtime errors.

Apply a pad-to-equal-chunk fallback (or use reduce_scatterv if available):

 def _tp_reduce_scatter(
     self,
     hidden_states: torch.Tensor,
     residual: torch.Tensor,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-    if hidden_states.shape[0] == 0:
-        return hidden_states, hidden_states
-
-    inputs = list(hidden_states.tensor_split(self._context.tp_size))
-    scattered_local_tokens = inputs[self._context.tp_rank]
-    hidden_states = get_tp_group().reduce_scatter(scattered_local_tokens, inputs)
-
-    if residual is not None:
-        residual = residual.tensor_split(self._context.tp_size)[
-            self._context.tp_rank
-        ]
-    return hidden_states, residual
+    total = hidden_states.shape[0]
+    if total == 0:
+        return hidden_states, hidden_states
+    tp_size = self._context.tp_size
+    rank = self._context.tp_rank
+    # Equal-size path
+    if total % tp_size == 0:
+        chunk = total // tp_size
+        inputs = list(hidden_states.split(chunk, dim=0))
+        out = torch.empty_like(inputs[rank])
+        get_tp_group().reduce_scatter(out, inputs)
+        hidden_states = out
+        if residual is not None:
+            residual = residual.split(chunk, dim=0)[rank]
+        return hidden_states, residual
+    # Fallback: pad to equal chunks, then slice local
+    max_chunk = (total + tp_size - 1) // tp_size
+    pad = max_chunk * tp_size - total
+    if pad:
+        pad_shape = (pad,) + hidden_states.shape[1:]
+        hidden_states_padded = torch.cat(
+            [hidden_states, hidden_states.new_zeros(pad_shape)], dim=0
+        )
+    else:
+        hidden_states_padded = hidden_states
+    inputs = list(hidden_states_padded.split(max_chunk, dim=0))
+    out = torch.empty_like(inputs[rank])
+    get_tp_group().reduce_scatter(out, inputs)
+    local_len = total // tp_size + (1 if rank < (total % tp_size) else 0)
+    hidden_states = out[:local_len]
+    if residual is not None:
+        residual = residual.tensor_split(tp_size)[rank]
+    return hidden_states, residual

If reduce_scatterv is available in GroupCoordinator, prefer it; otherwise keep this padding path.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def _tp_reduce_scatter(
self,
hidden_states: torch.Tensor,
residual: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
if hidden_states.shape[0] == 0:
return hidden_states, hidden_states
inputs = list(hidden_states.tensor_split(self._context.tp_size))
scattered_local_tokens = inputs[self._context.tp_rank]
hidden_states = get_tp_group().reduce_scatter(scattered_local_tokens, inputs)
if residual is not None:
residual = residual.tensor_split(self._context.tp_size)[
self._context.tp_rank
]
return hidden_states, residual
def _tp_reduce_scatter(
self,
hidden_states: torch.Tensor,
residual: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
total = hidden_states.shape[0]
if total == 0:
return hidden_states, hidden_states
tp_size = self._context.tp_size
rank = self._context.tp_rank
# Equal-size path
if total % tp_size == 0:
chunk = total // tp_size
inputs = list(hidden_states.split(chunk, dim=0))
out = torch.empty_like(inputs[rank])
get_tp_group().reduce_scatter(out, inputs)
hidden_states = out
if residual is not None:
residual = residual.split(chunk, dim=0)[rank]
return hidden_states, residual
# Fallback: pad to equal chunks, then slice local
max_chunk = (total + tp_size - 1) // tp_size
pad = max_chunk * tp_size - total
if pad:
pad_shape = (pad,) + hidden_states.shape[1:]
hidden_states_padded = torch.cat(
[hidden_states, hidden_states.new_zeros(pad_shape)], dim=0
)
else:
hidden_states_padded = hidden_states
inputs = list(hidden_states_padded.split(max_chunk, dim=0))
out = torch.empty_like(inputs[rank])
get_tp_group().reduce_scatter(out, inputs)
local_len = total // tp_size + (1 if rank < (total % tp_size) else 0)
hidden_states = out[:local_len]
if residual is not None:
residual = residual.tensor_split(tp_size)[rank]
return hidden_states, residual
🤖 Prompt for AI Agents
In python/sglang/srt/layers/communicator.py around lines 314 to 331, the current
use of tensor_split and reduce_scatter fails when hidden_states.shape[0] %
tp_size != 0 because NCCL reduce_scatter requires equal chunk sizes; update the
method to first branch: if the GroupCoordinator exposes a reduce_scatterv (or
similar variable-length API) call that with the original uneven splits;
otherwise compute a uniform chunk_size = ceil(L / tp_size), pad hidden_states
and residual on the sequence dimension to chunk_size * tp_size (with a safe pad
value) then reshape/split into equal chunks and call reduce_scatter; after the
operation trim the padded tokens from the returned hidden_states and residual
for the local rank so downstream shapes match the original L; ensure padding is
applied consistently on both tensors and removed before returning.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PyTorch docs for `torch.distributed.reduce_scatter(output, input_list, op=<RedOpType.SUM: 0>, group=None, async_op=False)` describe `input_list` as: "(list) List of tensors to reduce and scatter." In my reading, uneven-sized tensors are supported. Self-verification on local datasets such as GSM8K and MMLU shows normal results.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

🌐 Web query:

Does PyTorch torch.distributed.reduce_scatter support uneven sized tensors in input_list?

💡 Result:

Short answer: No — reduce_scatter (and reduce_scatter_multigpu / reduce_scatter_tensor) expects the input pieces to be sized consistently for the scatter and the output tensor size must be the same across ranks; uneven-sized tensors in the input_list are not supported. See the PyTorch docs stating the input must be the concatenation/stack of equal-sized per-rank outputs and that list lengths/sizes must match across processes. [1][2]

Sources:

  • torch.distributed.reduce_scatter / reduce_scatter_tensor docs (requires output size same across ranks and input sized = world_size * output_size). [1]
  • torch.distributed.reduce_scatter_multigpu / reduce_scatter (input_tensor_lists / input_list sizes must match across processes). [2]

🌐 Web query:

PyTorch tensor_split with reduce_scatter uneven chunks behavior

💡 Result:

Short answer: reduce_scatter requires each rank's output (the scattered piece) to have the same shape/size across ranks, so you cannot directly use torch.tensor_split to create uneven chunks and feed them into torch.distributed.reduce_scatter (or reduce_scatter_tensor) — the input must be a concatenation/stack of equal-sized output pieces. For uneven-shard use-cases you must either (a) pad chunks to equal size before reduce_scatter and then trim after, or (b) use higher-level DTensor/Shard abstractions (uneven sharding is experimental) or a custom collective that supports per-rank sizes. [1][2]

References

  • reduce_scatter / reduce_scatter_tensor docs: output must have same size across ranks; input is concatenation/stack of those outputs. [1]
  • DTensor docs: uneven sharding behavior is experimental (use caution / different semantics). [2]

Sources
[1] PyTorch distributed reduce_scatter / reduce_scatter_tensor documentation. (pytorch.cadn.net.cn)
[2] PyTorch DTensor (redistribute / sharding notes about uneven sharding being experimental). (docs.pytorch.org)

Citations:


@xu-yfei, I appreciate you sharing the documentation reference. However, according to the official PyTorch documentation [1], reduce_scatter requires that the output tensor size must be the same across all ranks, meaning the input chunks must be equal-sized. The documentation states: "output must have the same size across ranks" and "input is the concatenation/stack of those outputs".

Your tests on GSM8K and MMLU may be passing because:

  1. The sequence lengths in your test batches happen to divide evenly by tp_size=8 (e.g., 1024, 2048, 4096)
  2. Accuracy metrics might not catch subtle numerical errors from misaligned tensor operations

To verify this, could you please test the specific failing case mentioned by @miter6:

  • Sequence length: 1023 tokens with tp_size=8

With tensor_split, this creates uneven chunks: [128, 128, 128, 128, 128, 128, 128, 127], which violates reduce_scatter's requirement. You can verify this with:

import torch
hidden_states = torch.randn(1023, 7168)
chunks = list(hidden_states.tensor_split(8))
print([c.shape[0] for c in chunks])  # Will show [128, 128, 128, 128, 128, 128, 128, 127]

Recommendation: Implement the padding approach suggested in the original review comment to handle cases where total_tokens % tp_size != 0, or verify that your get_tp_group().reduce_scatter() wrapper provides additional handling for uneven chunks (which the standard PyTorch API does not).

Could you confirm by running a test with exactly 1023 tokens on TP8?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The bench_serving verification is normal. One of the logs shows an input length of 1021:
Prefill batch. #new-seq: 1, #new-token: 1021, #cached-token: 3, token usage: 0.00, #running-req: 0, #queue-req: 0

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

def prepare_mlp(
self,
hidden_states: torch.Tensor,
Expand Down Expand Up @@ -371,12 +517,17 @@ def postprocess_layer(
)

def should_use_reduce_scatter(self, forward_batch: ForwardBatch):
    """Whether the post-MLP all-reduce can be replaced by a reduce-scatter."""
    if not self.allow_reduce_scatter:
        return False
    # DP-attention path: the scatter fn is selected and the batch is padded
    # to the max length, so every rank holds equally sized chunks.
    dp_scatter_ready = (
        self._communicate_summable_tensor_pair_fn
        is CommunicateSummableTensorPairFn._scatter_hidden_states
        and forward_batch.dp_padding_mode.is_max_len()
    )
    # Attn-TP scattered-input path: all layers but the last keep hidden
    # states scattered, so a reduce-scatter is valid there as well.
    attn_tp_scatter_ready = (
        get_attn_tp_context().input_scattered and not self.is_last_layer
    )
    return dp_scatter_ready or attn_tp_scatter_ready

def should_fuse_mlp_allreduce_with_next_layer(
self, forward_batch: ForwardBatch
Expand All @@ -388,6 +539,9 @@ def should_fuse_mlp_allreduce_with_next_layer(
):
return False

if get_attn_tp_context().input_scattered:
return False

batch_size = (
forward_batch.input_ids.shape[0]
if hasattr(forward_batch, "input_ids")
Expand Down Expand Up @@ -422,6 +576,7 @@ class CommunicateContext:
attn_dp_size: int
tp_size: int
cache = None
tp_rank: int

def is_same_group_size(self, a: ScatterMode, b: ScatterMode):
return self.process_group_sizes[a] == self.process_group_sizes[b]
Expand All @@ -432,6 +587,7 @@ def init_new(cls):
attn_tp_size = get_attention_tp_size()
attn_dp_size = get_attention_dp_size()
tp_size = get_tensor_model_parallel_world_size()
tp_rank = get_tensor_model_parallel_rank()
process_group_sizes = {
ScatterMode.SCATTERED: 1,
ScatterMode.TP_ATTN_FULL: attn_tp_size,
Expand All @@ -444,6 +600,7 @@ def init_new(cls):
attn_tp_size=attn_tp_size,
attn_dp_size=attn_dp_size,
tp_size=tp_size,
tp_rank=tp_rank,
)


Expand Down Expand Up @@ -566,6 +723,14 @@ def _gather_hidden_states_and_residual(
*,
residual_input_mode,
):
if get_attn_tp_context().input_scattered:
return CommunicateWithAllReduceAndLayerNormFn._tp_all_reduce_with_scattered_residual(
hidden_states,
residual,
layernorm,
context,
)

if residual_input_mode == ScatterMode.SCATTERED and context.attn_tp_size > 1:
residual, local_residual = (
get_local_dp_buffer(),
Expand Down Expand Up @@ -637,6 +802,22 @@ def _scatter_hidden_states_and_residual(
hidden_states, residual = layernorm(hidden_states, residual)
return hidden_states, residual

@staticmethod
def _tp_all_reduce_with_scattered_residual(
    hidden_states: torch.Tensor,
    residual: torch.Tensor,
    layernorm: torch.nn.Module,
    context: CommunicateContext,
):
    # Recombine the rank-local residual with the full hidden states using a
    # single all-reduce, then apply layernorm.
    #
    # `hidden_states` is expected to hold the full (padded) token count on
    # every rank, while `residual` holds only this rank's token slice.
    if hidden_states.shape[0] == 0:
        return hidden_states, hidden_states

    # tensor_split returns views, so the in-place += below writes this
    # rank's residual directly into its slice of `hidden_states`.
    scattered_states = hidden_states.tensor_split(context.tp_size)[context.tp_rank]
    scattered_states += residual
    # The all-reduce sums across ranks: each slice receives its owning
    # rank's residual exactly once on top of the reduced hidden states.
    residual = tensor_model_parallel_all_reduce(hidden_states)
    # Single-argument layernorm call: normalize without a fused add.
    hidden_states = layernorm(residual)
    return hidden_states, residual


class CommunicateSummableTensorPairFn:
"""It is allowed to make (hidden_states, residual) := (hidden_states + residual, None) if needed."""
Expand Down
10 changes: 5 additions & 5 deletions python/sglang/srt/layers/vocab_parallel_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use_symmetric_memory,
)
from sglang.srt.layers.amx_utils import PackWeightMethod
from sglang.srt.layers.communicator import get_attn_tp_context
from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
from sglang.srt.layers.parameter import BasevLLMParameter
from sglang.srt.layers.quantization.base_config import (
Expand Down Expand Up @@ -478,11 +479,10 @@ def forward(self, input_):
# Mask the output embedding.
if self.tp_size > 1:
output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
# Reduce across all the model parallel GPUs.
output = tensor_model_parallel_all_reduce(output_parallel)
else:
output = output_parallel
return output
if not get_attn_tp_context().input_scattered:
# Reduce across all the model parallel GPUs.
output_parallel = tensor_model_parallel_all_reduce(output_parallel)
return output_parallel

def extra_repr(self) -> str:
s = f"num_embeddings={self.num_embeddings_per_partition}"
Expand Down
28 changes: 24 additions & 4 deletions python/sglang/srt/model_executor/forward_batch_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@
import triton
import triton.language as tl

from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
from sglang.srt.distributed.parallel_state import (
get_moe_expert_parallel_world_size,
get_tensor_model_parallel_world_size,
)
from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
from sglang.srt.layers.dp_attention import (
DpPaddingMode,
Expand Down Expand Up @@ -766,6 +769,13 @@ def prepare_mlp_sync_batch(self, model_runner: ModelRunner):
else:
bs = self.batch_size = num_tokens

# padding
self._pad_inputs_to_size(model_runner, num_tokens, bs)
self.global_num_tokens_cpu = global_num_tokens
global_num_tokens_pinned = torch.tensor(global_num_tokens, pin_memory=True)
self.global_num_tokens_gpu.copy_(global_num_tokens_pinned, non_blocking=True)

def _pad_inputs_to_size(self, model_runner: ModelRunner, num_tokens, bs):
# padding
self.input_ids = self._pad_tensor_to_size(self.input_ids, num_tokens)
self.req_pool_indices = self._pad_tensor_to_size(self.req_pool_indices, bs)
Expand All @@ -788,9 +798,6 @@ def prepare_mlp_sync_batch(self, model_runner: ModelRunner):
if self.encoder_lens is not None:
self.encoder_lens = self._pad_tensor_to_size(self.encoder_lens, bs)
self.positions = self._pad_tensor_to_size(self.positions, num_tokens)
self.global_num_tokens_cpu = global_num_tokens
global_num_tokens_pinned = torch.tensor(global_num_tokens, pin_memory=True)
self.global_num_tokens_gpu.copy_(global_num_tokens_pinned, non_blocking=True)

if self.mrope_positions is not None:
self.mrope_positions = self._pad_tensor_to_size(self.mrope_positions, bs)
Expand Down Expand Up @@ -818,6 +825,19 @@ def prepare_mlp_sync_batch(self, model_runner: ModelRunner):
spec_info.hidden_states, num_tokens
)

def prepare_attn_tp_scatter_input(self, model_runner: ModelRunner):
    """Pad token-level inputs so they divide evenly across TP ranks.

    No-op unless the attn-TP input-scattered path is active for this batch.
    """
    # Local import avoids a circular dependency with the communicator module.
    from sglang.srt.layers.communicator import get_attn_tp_context

    if not get_attn_tp_context().use_input_scattered(self):
        return
    assert self.forward_mode.is_extend()
    tp_size = get_tensor_model_parallel_world_size()
    num_tokens = self.input_ids.shape[0]
    # Round the token count up to the next multiple of tp_size.
    padded_num_tokens = -(-num_tokens // tp_size) * tp_size
    self._pad_inputs_to_size(model_runner, padded_num_tokens, self.batch_size)

def post_forward_mlp_sync_batch(self, logits_output: LogitsProcessorOutput):

self.forward_mode = getattr(self, "_original_forward_mode", self.forward_mode)
Expand Down
Loading
Loading