
Commit eee58cb: remove debug comments

Parent: 96d04a5
6 files changed: 8 additions, 71 deletions

tensorrt_llm/_torch/models/modeling_deepseekv3.py
1 addition, 1 deletion

@@ -167,7 +167,7 @@ def forward(self,
         from ..distributed import allgather
         import os
         from tensorrt_llm.mapping import Mapping
-        lm_tp_size = int(os.getenv('LM_TP_SIZE', 8))
+        lm_tp_size = int(os.getenv('LM_TP_SIZE', 2))
         assert self.model_config.mapping.tp_size % lm_tp_size == 0
         lm_pp_size = self.model_config.mapping.pp_size * self.model_config.mapping.tp_size // lm_tp_size
         mapping_lm_tp = Mapping(
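
For reference, a minimal standalone sketch of the arithmetic in this hunk: the LM-head TP size is read from the LM_TP_SIZE environment variable (now defaulting to 2), and the remaining ranks are folded into a pipeline dimension. ToyMapping and lm_head_parallel_sizes are hypothetical names used only for illustration; the real code uses tensorrt_llm.mapping.Mapping.

```python
import os
from dataclasses import dataclass


@dataclass
class ToyMapping:
    """Stand-in for tensorrt_llm.mapping.Mapping (illustrative only)."""
    tp_size: int
    pp_size: int
    rank: int


def lm_head_parallel_sizes(mapping: ToyMapping) -> tuple:
    # Mirror of the diff above: LM_TP_SIZE defaults to 2, and the ranks not
    # used for LM-head tensor parallelism become a pipeline dimension.
    lm_tp_size = int(os.getenv('LM_TP_SIZE', 2))
    assert mapping.tp_size % lm_tp_size == 0
    lm_pp_size = mapping.pp_size * mapping.tp_size // lm_tp_size
    return lm_tp_size, lm_pp_size


print(lm_head_parallel_sizes(ToyMapping(tp_size=8, pp_size=1, rank=0)))  # (2, 4)
```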

tensorrt_llm/_torch/models/modeling_speculative.py
0 additions, 5 deletions

@@ -433,11 +433,6 @@ def forward(
             attn_metadata,
             True,
         )
-        # print(f"lm_head.weight.data_ptr: {self.lm_head.weight.data_ptr()}")
-        # print(f"lm_head.weight.shape: {self.lm_head.weight.shape}")
-        # print(f"In SpecDecOneEngineForCausalLM, before spec_worker, logits.shape: {logits.shape}")
-        # print(f"draft_model.lm_head.weight.data_ptr: {self.draft_model.lm_head.weight.data_ptr()}")
-        # print(f"draft_model.lm_head.weight.shape: {self.draft_model.lm_head.weight.shape}")
         # get accepted tokens and next draft tokens
         return self.spec_worker(input_ids=input_ids,
                                 position_ids=position_ids,

tensorrt_llm/_torch/models/modeling_utils.py
0 additions, 2 deletions

@@ -353,14 +353,12 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig],
         self.has_custom_lm_head = False

         if config.mapping.enable_attention_dp and not getattr(config.mapping, 'enable_lm_tp_in_adp', False):
-            print(f"In DecoderModelForCausalLM, creating LMHead without TP")
             self.lm_head = LMHead(
                 vocab_size,
                 hidden_size,
                 dtype=config.pretrained_config.torch_dtype,
             )
         else:
-            print(f"In DecoderModelForCausalLM, creating LMHead with TP")
             # TODO(zhenhuanc): Currently lm_head Linear will not accept QuantConfig
             # will considering per layer QuantConfig in the future.
             if (hasattr(config, 'lora_config')
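
A condensed sketch of the branch the removed prints were annotating: under attention data parallelism the LM head is created without tensor parallelism unless enable_lm_tp_in_adp opts it back in. The helper name is hypothetical and `mapping` is any object exposing the two flags.

```python
def builds_lm_head_without_tp(mapping) -> bool:
    # Matches the `if` guard in DecoderModelForCausalLM.__init__ shown above:
    # ADP enabled and LM TP in ADP not requested -> plain (non-TP) LMHead.
    return mapping.enable_attention_dp and not getattr(
        mapping, 'enable_lm_tp_in_adp', False)
```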

tensorrt_llm/_torch/modules/embedding.py
4 additions, 5 deletions

@@ -37,11 +37,10 @@ def __init__(
         local_out_features = num_embeddings
         mapping = mapping or Mapping()
         if (mapping.enable_attention_dp and
-                getattr(mapping, 'enable_lm_tp_in_adp', False)) and os.getenv('LM_TP_SIZE') is not None:
-            lm_tp_size = int(os.getenv('LM_TP_SIZE', 8))
+                getattr(mapping, 'enable_lm_tp_in_adp', False)):
+            lm_tp_size = int(os.getenv('LM_TP_SIZE', 2))
             assert mapping.tp_size % lm_tp_size == 0, f"mapping.tp_size % lm_tp_size == 0, {mapping.tp_size} % {lm_tp_size} != 0"
             lm_pp_size = mapping.pp_size * mapping.tp_size // lm_tp_size
-            # print(f"In LMHead, mapping.tp_group: {mapping.tp_group}")
             mapping = Mapping(
                 world_size=lm_tp_size * lm_pp_size,
                 rank=mapping.rank,

@@ -51,7 +50,6 @@ def __init__(
                 enable_attention_dp=mapping.enable_attention_dp,
                 enable_lm_tp_in_adp=mapping.enable_lm_tp_in_adp,
             )
-            print(f"In LMHead, mapping_lm_tp.tp_group: {mapping.tp_group}")

         tp_size = mapping.tp_size

@@ -104,7 +102,8 @@ def forward(
         all_reduce_params: Optional[AllReduceParams] = None,
         is_mtp_head: bool = False,
     ) -> torch.Tensor:
-        if is_mtp_head and getattr(self.mapping, 'enable_lm_tp_in_adp', False):
+        if is_mtp_head and (self.mapping.enable_attention_dp and
+                            getattr(self.mapping, 'enable_lm_tp_in_adp', False)):
             tp_rank = self.mapping.tp_rank
             tp_size = self.mapping.tp_size
             tensor_shape = self.weight.shape
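
The forward-path change above tightens the MTP-head special case so it also requires attention data parallelism, not just the enable_lm_tp_in_adp flag. A small illustration of the new predicate, with SimpleNamespace standing in for the real Mapping and the helper name chosen only for this example:

```python
from types import SimpleNamespace


def takes_mtp_head_lm_tp_path(is_mtp_head: bool, mapping) -> bool:
    # New guard from the hunk above: both flags must be set.
    return is_mtp_head and (mapping.enable_attention_dp and
                            getattr(mapping, 'enable_lm_tp_in_adp', False))


# With ADP disabled the special path is now skipped, even if the LM-TP flag is set.
m = SimpleNamespace(enable_attention_dp=False, enable_lm_tp_in_adp=True)
assert not takes_mtp_head_lm_tp_path(True, m)
```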

tensorrt_llm/_torch/modules/logits_processor.py
1 addition, 45 deletions

@@ -9,9 +9,8 @@

 class LogitsProcessor(nn.Module):

-    def __init__(self, model_config: ModelConfig):
+    def __init__(self):
         super().__init__()
-        self.model_config = model_config

     def forward(self,
                 hidden_states: torch.Tensor,

@@ -30,49 +29,6 @@ def forward(self,
         else:
             hidden_states = hidden_states[-1]

-        # token_count = hidden_states.view(-1, hidden_states.shape[-1]).shape[0]
-
-        # # Add pre-lm gather logic
-        # if (self.model_config.mapping.enable_attention_dp and getattr(
-        #         self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
-        #     # ADP + LM TP mode: perform All-Gather before LM_head
-        #     from ..distributed import allgather
-        #     all_rank_max_num_tokens = attn_metadata.all_rank_max_num_tokens
-        #     pad_len = all_rank_max_num_tokens - token_count
-        #     if pad_len > 0:
-        #         padded_hidden_states = F.pad(hidden_states.view(
-        #             -1, hidden_states.shape[-1]), (0, 0, 0, pad_len),
-        #             mode="constant",
-        #             value=0)
-        #     else:
-        #         padded_hidden_states = hidden_states.view(
-        #             -1, hidden_states.shape[-1])
-        #     hidden_states = allgather(padded_hidden_states,
-        #                               self.model_config.mapping,
-        #                               dim=0)
-
-        # # Temporarily disable gather_output when not in ADP mode or (in ADP mode and LM TP is enabled)
-        # if (not self.model_config.mapping.enable_attention_dp) or (
-        #         self.model_config.mapping.enable_attention_dp and getattr(
-        #             self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
-        #     lm_head.gather_output = False
         logits = lm_head(hidden_states)
-        # if (not self.model_config.mapping.enable_attention_dp) or (
-        #         self.model_config.mapping.enable_attention_dp and getattr(
-        #             self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
-        #     lm_head.gather_output = True
-
-        # if (self.model_config.mapping.enable_attention_dp and getattr(
-        #         self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
-        #     # print(f"In LogitsProcessor, lm_head.weight.data_ptr: {lm_head.weight.data_ptr()}")
-        #     # print(f"In LogitsProcessor, lm_head.weight.shape: {lm_head.weight.shape}")
-        #     # print(f"In LogitsProcessor, logits.shape: {logits.shape}")
-        #     logits = allgather(logits, self.model_config.mapping, dim=-1)
-        #     batch_size = logits.shape[0]
-        #     local_batch_size = batch_size // self.model_config.mapping.tp_size
-        #     logits = logits.view(self.model_config.mapping.tp_size,
-        #                          local_batch_size, -1)
-        #     logits = logits[self.model_config.mapping.tp_rank][:token_count]
-        # print(f"In LogitsProcessor, final logits.shape: {logits.shape}")
         logits = logits.float()
         return logits
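
After this change LogitsProcessor no longer stores a ModelConfig, and its forward path reduces to a plain lm_head projection followed by a float cast. A condensed, runnable sketch of that shape; the real forward also takes arguments (such as attention metadata) that this hunk does not show, so treat this as an illustration only:

```python
import torch
from torch import nn


class MinimalLogitsProcessor(nn.Module):
    """Condensed sketch of the post-commit LogitsProcessor; not the full class."""

    def __init__(self):
        super().__init__()

    def forward(self, hidden_states: torch.Tensor,
                lm_head: nn.Module) -> torch.Tensor:
        # Project hidden states to vocabulary logits and return them in fp32.
        logits = lm_head(hidden_states)
        return logits.float()


# Example usage with a plain nn.Linear standing in for the LMHead module.
proc = MinimalLogitsProcessor()
print(proc.forward(torch.randn(4, 16), nn.Linear(16, 32)).shape)  # torch.Size([4, 32])
```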

tensorrt_llm/_torch/speculative/mtp.py
2 additions, 13 deletions

@@ -858,12 +858,7 @@ def sample_and_accept_draft_tokens(

         # context
         accepted_tokens[:num_contexts, 0] = target_tokens[:num_contexts]
-        # print(
-        #     f"In sample_and_accept_draft_tokens, accepted_tokens.shape: {accepted_tokens.shape}, num_contexts: {num_contexts}, mapping: {self.model_config.mapping.tp_rank}"
-        # )
-        # print(
-        #     f"In sample_and_accept_draft_tokens, target_tokens.shape: {target_tokens.shape}, num_gens: {num_gens}, mtp_num_modules: {mtp_num_modules}, mapping: {self.model_config.mapping.tp_rank}"
-        # )
+
         # generation
         gen_target_tokens = target_tokens[num_contexts:].reshape(
             num_gens, mtp_num_modules + 1)

@@ -1123,10 +1118,9 @@ def draft_sampler(
                 self.model_config.mapping, 'enable_lm_tp_in_adp', False)):
             # For ADP + LM TP mode, we need to find the global argmax across all TP ranks
             # First, get local argmax and max values
-            # print(f"In draft_sampler, initial logits.shape: {logits.shape}")
             import os
             from tensorrt_llm.mapping import Mapping
-            lm_tp_size = int(os.getenv('LM_TP_SIZE', 8))
+            lm_tp_size = int(os.getenv('LM_TP_SIZE', 2))
             assert self.model_config.mapping.tp_size % lm_tp_size == 0
             lm_pp_size = self.model_config.mapping.pp_size * self.model_config.mapping.tp_size // lm_tp_size
             mapping_lm_tp = Mapping(

@@ -1145,8 +1139,6 @@ def draft_sampler(
             gathered = gathered.view(mapping_lm_tp.tp_size, local_batch_size, -1)
             sliced_gathered = gathered[mapping_lm_tp.tp_rank]
             draft_tokens = self.get_draft_tokens_from_gathered(sliced_gathered)
-            # draft_tokens = torch.argmax(sliced_gathered,
-            #                             dim=-1).type(torch.int32)
         else:
             # Simple argmax if no TP or no model config
             draft_tokens = torch.argmax(logits, dim=-1).type(torch.int32)

@@ -1246,9 +1238,6 @@ def prepare_position_ids_and_last_tokens(position_ids, attn_metadata):
             **inputs)
         # All of the seq_len are 1, use batch_indices_cuda as gather_ids
         gather_ids = spec_metadata.batch_indices_cuda[:batch_size]
-        # print(f"In MTPEagleWorker, hidden_states.shape: {hidden_states.shape}, hidden_states[gather_ids].shape: {hidden_states[gather_ids].shape}")
-        # print(f"In MTPEagleWorker, gather_ids.shape: {gather_ids.shape}, mapping: {self.model_config.mapping.tp_rank}")
-        # print(f"In MTPEagleWorker, spec_metadata.max_num_requests: {spec_metadata.max_num_requests}, mapping: {self.model_config.mapping.tp_rank}")
         hidden_states_gathered = hidden_states[gather_ids]
         token_count = hidden_states_gathered.view(-1,
             hidden_states_gathered.shape[-1]).shape[0]
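
For context, a toy illustration of the gathered-logits handling in draft_sampler under ADP + LM TP: logits gathered across the LM TP group are viewed as [tp_size, local_batch, vocab], each rank keeps its own slice, and draft tokens come from an argmax over the vocabulary dimension (the real code delegates that last step to get_draft_tokens_from_gathered). Shapes and the rank value here are made up for the example.

```python
import torch

lm_tp_size, local_batch_size, vocab = 2, 3, 8
tp_rank = 0  # this rank's position in the LM TP group

# Pretend `gathered` holds logits collected from all LM TP ranks.
gathered = torch.randn(lm_tp_size * local_batch_size, vocab)
gathered = gathered.view(lm_tp_size, local_batch_size, -1)

sliced_gathered = gathered[tp_rank]  # [local_batch, vocab]
draft_tokens = torch.argmax(sliced_gathered, dim=-1).to(torch.int32)
print(draft_tokens.shape)  # torch.Size([3])
```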
