30 changes: 27 additions & 3 deletions tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -70,6 +70,9 @@
from .modeling_utils import (DecoderModel, EagerFusionConfig, filter_weights,
register_auto_model)

from ..distributed import allgather
import os
from tensorrt_llm.mapping import Mapping

@triton.jit
def weight_dequant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_SIZE: tl.constexpr):
@@ -166,10 +169,31 @@ def forward(self,
else:
hidden_states = hidden_states[-1].unsqueeze(0)

if not (self.model_config.mapping.enable_attention_dp):
# Pre-LM-head gather logic
if (self.model_config.mapping.enable_attention_dp and
getattr(self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
# ADP + LM TP mode: perform All-Gather before LM_head
lm_tp_size = int(os.getenv('LM_TP_SIZE', 2))
assert self.model_config.mapping.tp_size % lm_tp_size == 0
lm_pp_size = self.model_config.mapping.pp_size * self.model_config.mapping.tp_size // lm_tp_size
mapping_lm_tp = Mapping(
world_size=lm_tp_size * lm_pp_size,
rank=self.model_config.mapping.rank,
gpus_per_node=self.model_config.mapping.gpus_per_node,
tp_size=lm_tp_size,
pp_size=lm_pp_size,
enable_attention_dp=self.model_config.mapping.enable_attention_dp,
enable_lm_tp_in_adp=self.model_config.mapping.enable_lm_tp_in_adp,
)
hidden_states = allgather(hidden_states, mapping_lm_tp, dim=0)

# Temporarily disable gather_output whenever the LM head runs tensor-parallel: plain TP (ADP disabled) or ADP with LM TP enabled
if (not self.model_config.mapping.enable_attention_dp) or (self.model_config.mapping.enable_attention_dp and
getattr(self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
lm_head.gather_output = False
logits = lm_head(hidden_states)
if not (self.model_config.mapping.enable_attention_dp):
logits = lm_head(hidden_states, is_mtp_head=True)
if (not self.model_config.mapping.enable_attention_dp) or (self.model_config.mapping.enable_attention_dp and
getattr(self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
lm_head.gather_output = True
return logits

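The LM-TP sub-mapping built above is repeated almost verbatim in embedding.py and mtp.py below. A minimal sketch of the shared idea, using only the Mapping arguments that appear in this diff; the helper name build_lm_tp_mapping is hypothetical and not part of the PR:

```python
import os

from tensorrt_llm.mapping import Mapping


def build_lm_tp_mapping(mapping: Mapping) -> Mapping:
    """Hypothetical helper: derive the LM-head TP mapping from the full mapping.

    Example: with tp_size=8, pp_size=1 and LM_TP_SIZE=2, the LM head runs TP=2
    and the remaining ranks fold into pp_size=4, so world_size stays 8.
    """
    lm_tp_size = int(os.getenv('LM_TP_SIZE', 2))
    assert mapping.tp_size % lm_tp_size == 0, (
        f"tp_size ({mapping.tp_size}) must be divisible by LM_TP_SIZE ({lm_tp_size})")
    lm_pp_size = mapping.pp_size * mapping.tp_size // lm_tp_size
    return Mapping(
        world_size=lm_tp_size * lm_pp_size,
        rank=mapping.rank,
        gpus_per_node=mapping.gpus_per_node,
        tp_size=lm_tp_size,
        pp_size=lm_pp_size,
        enable_attention_dp=mapping.enable_attention_dp,
        enable_lm_tp_in_adp=mapping.enable_lm_tp_in_adp,
    )
```

With such a helper, the pre-LM-head branch above would reduce to hidden_states = allgather(hidden_states, build_lm_tp_mapping(self.model_config.mapping), dim=0).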
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/models/modeling_utils.py
@@ -352,7 +352,7 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig],
self.pp_size = config.mapping.pp_size
self.has_custom_lm_head = False

if config.mapping.enable_attention_dp:
if config.mapping.enable_attention_dp and not getattr(config.mapping, 'enable_lm_tp_in_adp', False):
self.lm_head = LMHead(
vocab_size,
hidden_size,
34 changes: 32 additions & 2 deletions tensorrt_llm/_torch/modules/embedding.py
@@ -1,4 +1,5 @@
import math
Contributor review comment (🛠️ Refactor suggestion):

Missing NVIDIA 2025 copyright header

Please prepend the standard NVIDIA copyright header (2025) to comply with repo guidelines.

🤖 Prompt for AI Agents:
In tensorrt_llm/_torch/modules/embedding.py around lines 1 to 1, the file is missing the required NVIDIA 2025 copyright header; prepend the standard NVIDIA 2025 copyright header block at the very top of the file (above the existing import math line), ensuring the header text, year (2025), and any required license or ownership wording match the repo's standard header template exactly, and preserve a blank line after the header before the first import.

import os
from typing import Dict, List, Optional, Tuple

import torch
@@ -35,6 +36,21 @@ def __init__(
local_in_features = embedding_dim
local_out_features = num_embeddings
mapping = mapping or Mapping()
if (mapping.enable_attention_dp and
getattr(mapping, 'enable_lm_tp_in_adp', False)):
lm_tp_size = int(os.getenv('LM_TP_SIZE', 2))
assert mapping.tp_size % lm_tp_size == 0, f"mapping.tp_size ({mapping.tp_size}) must be divisible by lm_tp_size ({lm_tp_size})"
lm_pp_size = mapping.pp_size * mapping.tp_size // lm_tp_size
mapping = Mapping(
world_size=lm_tp_size * lm_pp_size,
rank=mapping.rank,
gpus_per_node=mapping.gpus_per_node,
tp_size=lm_tp_size,
pp_size=lm_pp_size,
enable_attention_dp=mapping.enable_attention_dp,
enable_lm_tp_in_adp=mapping.enable_lm_tp_in_adp,
)

tp_size = mapping.tp_size

# Attention DP doesn't work with embedding parallelization.
@@ -83,9 +99,23 @@ def forward(
self,
input: torch.Tensor,
*,
all_reduce_params: Optional[AllReduceParams] = None
all_reduce_params: Optional[AllReduceParams] = None,
is_mtp_head: bool = False,
) -> torch.Tensor:
output = super().forward(input, all_reduce_params=all_reduce_params)
if is_mtp_head and (self.mapping.enable_attention_dp and
getattr(self.mapping, 'enable_lm_tp_in_adp', False)):
tp_rank = self.mapping.tp_rank
tp_size = self.mapping.tp_size
tensor_shape = self.weight.shape
width = tensor_shape[0]
slice_width = math.ceil(width / tp_size)
slice_start = tp_rank * slice_width
slice_end = min((tp_rank + 1) * slice_width, width)
slice_obj = [slice(None)] * len(tensor_shape)
slice_obj[0] = slice(slice_start, slice_end)
output = F.linear(input, self.weight[tuple(slice_obj)], None)
else:
output = super().forward(input, all_reduce_params=all_reduce_params)
if (self.tp_mode == TensorParallelMode.COLUMN and self.gather_output
and self.padding_size > 0):
output = output[..., :-self.padding_size]
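A worked example of the slice arithmetic in the new is_mtp_head path, which indexes the full-vocabulary weight by rank at runtime. The numbers are illustrative (vocab size 129280, LM_TP_SIZE=2), not taken from the PR:

```python
import math

width = 129280        # weight.shape[0], i.e. the vocabulary size (illustrative)
tp_size = 2           # LM TP size inside ADP
slice_width = math.ceil(width / tp_size)   # 64640 rows per rank

for tp_rank in range(tp_size):
    slice_start = tp_rank * slice_width
    slice_end = min((tp_rank + 1) * slice_width, width)
    # rank 0 owns vocab rows [0, 64640), rank 1 owns [64640, 129280)
    print(tp_rank, slice_start, slice_end)
```

Each rank therefore emits logits only for its vocabulary shard; the shard-local argmax is mapped back to a global token id in draft_sampler, as shown further down in mtp.py.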
68 changes: 64 additions & 4 deletions tensorrt_llm/_torch/speculative/mtp.py
@@ -2,6 +2,7 @@
from typing import TYPE_CHECKING, List, Optional

import torch
import torch.nn.functional as F
from torch import nn

from ..attention_backend import AttentionMetadata
@@ -17,6 +18,8 @@
if TYPE_CHECKING:
from tensorrt_llm.llmapi.llm_args import MTPDecodingConfig

import os
from tensorrt_llm.mapping import Mapping

@dataclass(kw_only=True)
class SampleStateTensorsMTP(SampleStateTensors):
@@ -473,9 +476,23 @@ def forward(
for _, mtp_layer in enumerate(draft_model.mtp_layers):
hidden_states = mtp_layer(embed_tokens=draft_model.embed_tokens,
**draft_inputs)
logits = mtp_layer.shared_head(hidden_states, draft_model.lm_head,
token_count = hidden_states.view(-1,
hidden_states.shape[-1]).shape[0]
all_rank_max_num_tokens = attn_metadata.all_rank_max_num_tokens
pad_len = all_rank_max_num_tokens - token_count
if pad_len > 0:
padded_hidden_states = F.pad(hidden_states.view(
-1, hidden_states.shape[-1]), (0, 0, 0, pad_len),
mode="constant",
value=0)
else:
padded_hidden_states = hidden_states.view(
-1, hidden_states.shape[-1])
logits = mtp_layer.shared_head(padded_hidden_states,
draft_model.lm_head,
attn_metadata).float()
new_draft_token = self.draft_sampler(logits)
new_draft_token = new_draft_token[:token_count]
next_draft_tokens.append(new_draft_token)
Contributor review comment on lines 494 to 496 (⚠️ Potential issue):

Signature mismatch: draft_sampler now requires iter but call omits it

This will raise a TypeError at runtime. Pass the loop index and update enumerate accordingly.

-        for _, mtp_layer in enumerate(draft_model.mtp_layers):
+        for i, mtp_layer in enumerate(draft_model.mtp_layers):
...
-            new_draft_token = self.draft_sampler(logits)
+            new_draft_token = self.draft_sampler(logits, i)

🤖 Prompt for AI Agents:
In tensorrt_llm/_torch/speculative/mtp.py around lines 492-494, the call to self.draft_sampler(logits) omits the required iter argument; update the surrounding loop to use enumerate (e.g., for i, ... in enumerate(...)) and pass the loop index into draft_sampler (self.draft_sampler(logits, i)), then keep the rest of the logic (slicing to token_count and appending) unchanged.

# shift input_ids and hidden_states
input_ids = draft_inputs["input_ids"]
@@ -1041,12 +1058,13 @@ def prepare_drafter_inputs(
}

@torch.compile(options={"max-autotune": True})
def get_local_max_and_combined(self, logits):
def get_local_max_and_combined(self, logits, mapping_lm_tp=None):
local_max_values, local_argmax = torch.max(logits, dim=-1, keepdim=True)
# Adjust indices based on TP rank and size
vocab_per_rank = logits.shape[-1]
mapping_lm_tp = mapping_lm_tp if mapping_lm_tp is not None else self.model_config.mapping
max_index_per_rank = local_argmax.type(
torch.int32) + (self.model_config.mapping.tp_rank * vocab_per_rank)
torch.int32) + (mapping_lm_tp.tp_rank * vocab_per_rank)
# Use torch.stack and flatten instead of view+cat to avoid torch.compile issues
# Convert both to float32 to ensure consistent dtype
max_index_per_rank_float = max_index_per_rank.float()
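The tp_rank * vocab_per_rank offset above turns a shard-local argmax into a global token id. A minimal sketch of the same arithmetic with illustrative sizes (the shapes are assumptions, not taken from the PR):

```python
import torch

vocab_per_rank = 64640                   # logits.shape[-1] on this LM TP rank (illustrative)
tp_rank = 1                              # this rank owns global vocab ids [64640, 129280)
logits = torch.randn(4, vocab_per_rank)  # per-rank logits for 4 tokens

local_max_values, local_argmax = torch.max(logits, dim=-1, keepdim=True)
# Shift the shard-local index by the rank's vocab offset to recover the global token id.
max_index_per_rank = local_argmax.type(torch.int32) + tp_rank * vocab_per_rank
```

After the allgather, get_draft_tokens_from_gathered presumably picks, per token, the index belonging to the rank with the largest value.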
@@ -1095,6 +1113,32 @@ def draft_sampler(
combined = self.get_local_max_and_combined(logits)
gathered = allgather(combined, self.model_config.mapping, dim=-1)
draft_tokens = self.get_draft_tokens_from_gathered(gathered)
elif (self.model_config is not None
and hasattr(self.model_config, 'mapping')
and self.model_config.mapping.tp_size > 1) and (
self.model_config.mapping.enable_attention_dp and getattr(
self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
# For ADP + LM TP mode, we need to find the global argmax across all TP ranks
# First, get local argmax and max values
lm_tp_size = int(os.getenv('LM_TP_SIZE', 2))
assert self.model_config.mapping.tp_size % lm_tp_size == 0
lm_pp_size = self.model_config.mapping.pp_size * self.model_config.mapping.tp_size // lm_tp_size
mapping_lm_tp = Mapping(
world_size=lm_tp_size * lm_pp_size,
rank=self.model_config.mapping.rank,
gpus_per_node=self.model_config.mapping.gpus_per_node,
tp_size=lm_tp_size,
pp_size=lm_pp_size,
enable_attention_dp=self.model_config.mapping.enable_attention_dp,
enable_lm_tp_in_adp=self.model_config.mapping.enable_lm_tp_in_adp,
)
combined = self.get_local_max_and_combined(logits, mapping_lm_tp)
gathered = allgather(combined, mapping_lm_tp, dim=-1)
batch_size = logits.shape[0]
local_batch_size = batch_size // mapping_lm_tp.tp_size
gathered = gathered.view(mapping_lm_tp.tp_size, local_batch_size, -1)
sliced_gathered = gathered[mapping_lm_tp.tp_rank]
draft_tokens = self.get_draft_tokens_from_gathered(sliced_gathered)
else:
# Simple argmax if no TP or no model config
draft_tokens = torch.argmax(logits, dim=-1).type(torch.int32)
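For the ADP + LM TP branch above: the hidden states fed to the LM head were already all-gathered across the LM TP group (dim=0), so the leading dimension of logits holds every rank's padded tokens. After gathering each rank's (value, index) pairs along the last dimension, the view/index pattern lets each rank keep only its own tokens. A shape-only sketch under illustrative sizes; the 2-wide last dimension per rank is an assumption about get_local_max_and_combined's output layout:

```python
import torch

tp_size = 2              # LM TP group size
local_batch_size = 3     # tokens this rank contributed before the pre-LM-head all-gather
batch_size = tp_size * local_batch_size   # rows of logits on every rank

# Stand-in for allgather(combined, mapping_lm_tp, dim=-1): each rank contributes
# a [batch_size, 2] tensor (max value and shifted argmax per token).
gathered = torch.randn(batch_size, 2 * tp_size)

gathered = gathered.view(tp_size, local_batch_size, -1)  # [tp_size, local_batch_size, 2 * tp_size]
tp_rank = 0
sliced_gathered = gathered[tp_rank]                      # this rank's own tokens: [local_batch_size, 2 * tp_size]
```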
@@ -1194,10 +1238,26 @@ def prepare_position_ids_and_last_tokens(position_ids, attn_metadata):
**inputs)
# All of the seq_len are 1, use batch_indices_cuda as gather_ids
gather_ids = spec_metadata.batch_indices_cuda[:batch_size]
hidden_states_gathered = hidden_states[gather_ids]
token_count = hidden_states_gathered.view(-1,
hidden_states_gathered.shape[-1]).shape[0]
max_num_requests = spec_metadata.max_num_requests
pad_len = max_num_requests - token_count
if pad_len > 0:
padded_hidden_states = F.pad(hidden_states_gathered.view(
-1, hidden_states_gathered.shape[-1]), (0, 0, 0, pad_len),
mode="constant",
value=0)
elif pad_len == 0:
padded_hidden_states = hidden_states_gathered.view(
-1, hidden_states_gathered.shape[-1])
else:
raise ValueError("In MTPEagleWorker.forward(), token_count > max_num_requests is not supported")
logits = draft_model.mtp_layers[0].shared_head(
hidden_states[gather_ids], draft_model.lm_head, attn_metadata,
padded_hidden_states, draft_model.lm_head, attn_metadata,
True)
new_draft_token = self.draft_sampler(logits)
new_draft_token = new_draft_token[:token_count]

hidden_states, position_ids = self.update_draft_tokens(
next_draft_tokens, new_draft_token, hidden_states, gather_ids,
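Both MTP paths above pad the hidden states to a fixed row count (all_rank_max_num_tokens or max_num_requests) before shared_head, so every ADP rank feeds the now tensor-parallel LM head the same number of rows, and then trim the sampled draft tokens back to token_count. A minimal sketch of that padding pattern; the helper name pad_rows_to is hypothetical:

```python
import torch
import torch.nn.functional as F


def pad_rows_to(hidden_states: torch.Tensor, target_rows: int) -> torch.Tensor:
    """Zero-pad a [*, hidden] tensor to target_rows rows (hypothetical helper)."""
    flat = hidden_states.view(-1, hidden_states.shape[-1])
    pad_len = target_rows - flat.shape[0]
    if pad_len > 0:
        # (0, 0, 0, pad_len): leave the hidden dim alone, append pad_len zero rows.
        flat = F.pad(flat, (0, 0, 0, pad_len), mode="constant", value=0)
    return flat
```

The corresponding new_draft_token = new_draft_token[:token_count] lines then drop whatever the padded rows sampled.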
8 changes: 8 additions & 0 deletions tensorrt_llm/llmapi/llm_args.py
@@ -225,6 +225,7 @@ class _ParallelConfig:
moe_ep_size: int = 1
cp_config: dict = field(default_factory=dict)
enable_attention_dp: bool = False
enable_lm_tp_in_adp: bool = False
auto_parallel: bool = False

_world_size: int = field(default=1, init=False)
@@ -288,6 +289,7 @@ def to_mapping(self) -> Mapping:
cp_size=self.cp_size,
cp_config=self.cp_config,
enable_attention_dp=self.enable_attention_dp,
enable_lm_tp_in_adp=self.enable_lm_tp_in_adp,
moe_cluster_size=self.moe_cluster_size,
moe_tp_size=self.moe_tp_size,
moe_ep_size=self.moe_ep_size,
@@ -1261,6 +1263,11 @@ class BaseLlmArgs(StrictBaseModel):
description="Enable attention data parallel.",
status="beta")

enable_lm_tp_in_adp: bool = Field(
Member review comment: @Superjomn FYI - is it ok to add another argument here? Any other suggestions?

Collaborator reply: It is OK for the prototype stage. The mechanism, I think, is like this:

  1. If there is no existing XxConfig to hold the new knob, it is fine to add a dangling knob, but mark it as a prototype.
  2. We can wait if there are more than two or three knobs in the same category, then we can consider grouping them into a XxConfig; no rush to introduce a hierarchical config before we are sure the knobs need it.
  3. When the feature is somewhat stable, we can mark the xxx_config beta then.

Member reply: Thanks, in that case, for this knob it should be status="prototype"?

Collaborator (@Superjomn, Sep 4, 2025) reply: Yes, I think so; for a dangling knob, it should start from "prototype", as we may refactor it with some hierarchical Config later.
default=False,
description="Enable lm tp in attention dp.",
status="beta")

cp_config: Optional[dict] = Field(default_factory=dict,
description="Context parallel config.",
status="prototype")
@@ -1508,6 +1515,7 @@ def validate_parallel_config(self):
moe_tp_size=self.moe_tensor_parallel_size,
moe_ep_size=self.moe_expert_parallel_size,
enable_attention_dp=self.enable_attention_dp,
enable_lm_tp_in_adp=self.enable_lm_tp_in_adp,
cp_config=self.cp_config)
return self

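A hedged usage sketch for the new knob through the LLM API. The keyword arguments mirror the BaseLlmArgs fields in this diff; the model id and parallel sizes are illustrative, and LM_TP_SIZE is the environment variable consumed by the modeling code above (defaulting to 2 if unset):

```python
import os

from tensorrt_llm import LLM

# Illustrative only: run attention data-parallel across 8 ranks while the
# LM head runs tensor-parallel across 2 of them (LM_TP_SIZE is read via os.getenv).
os.environ["LM_TP_SIZE"] = "2"

llm = LLM(
    model="deepseek-ai/DeepSeek-V3",  # illustrative model id
    tensor_parallel_size=8,
    enable_attention_dp=True,
    enable_lm_tp_in_adp=True,
)
```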
6 changes: 5 additions & 1 deletion tensorrt_llm/mapping.py
@@ -141,7 +141,8 @@ def __init__(
attn_tp_size=-1,
attn_cp_size=-1,
auto_parallel=False,
enable_attention_dp=False):
enable_attention_dp=False,
enable_lm_tp_in_adp=False):
# set default values for non-moe cases
# or where only one MOE parallelism size is specified
if moe_cluster_size == -1:
@@ -224,6 +225,7 @@ def __init__(
self.auto_parallel = auto_parallel
self.world_size = world_size
self.enable_attention_dp = enable_attention_dp
self.enable_lm_tp_in_adp = enable_lm_tp_in_adp
self.rank = rank
self.gpus_per_node = gpus_per_node
self.pp_groups = []
@@ -510,4 +512,6 @@ def to_dict(self):
'attn_cp_size': self.attn_cp_size,
'cp_config': self.cp_config,
'auto_parallel': self.auto_parallel,
'enable_attention_dp': self.enable_attention_dp,
'enable_lm_tp_in_adp': self.enable_lm_tp_in_adp,
}