
Commit 8dc4aac

[TRTLLM-8160][feat] Add max_total_draft_tokens (#8366)
Signed-off-by: Yue Weng <[email protected]>
1 parent a0024f4 commit 8dc4aac

18 files changed: +156 additions, -80 deletions

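This commit separates the draft-token budget into two quantities: `max_draft_len` (the per-path draft length, i.e. the depth of the drafting loop) and `max_total_draft_tokens` (the total number of draft tokens attached to a request, which for tree-style drafting counts every node of the draft token tree). The sketch below is illustrative only; the tree-choices layout and the `summarize()` helper are hypothetical and not part of this commit. For chain drafting the two values coincide.

```python
# Illustrative only: how max_draft_len and max_total_draft_tokens can differ
# once drafting follows a tree instead of a single chain. The tree-choices
# layout and summarize() are hypothetical, not taken from this commit.
linear_chain = [[0], [0, 0], [0, 0, 0]]          # 3 draft positions on one path
small_tree = [[0], [1], [0, 0], [0, 1], [1, 0]]  # 5 nodes, maximum depth 2

def summarize(choices):
    max_draft_len = max(len(path) for path in choices)    # deepest root-to-node path
    max_total_draft_tokens = len(choices)                 # every node needs a token slot
    return max_draft_len, max_total_draft_tokens

print(summarize(linear_chain))  # (3, 3): chain drafting, the two budgets coincide
print(summarize(small_tree))    # (2, 5): tree drafting, the totals diverge
```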

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 6 additions & 7 deletions
@@ -320,13 +320,11 @@ def create_autodeploy_executor(ad_config: LlmArgs):
     max_draft_len = (
         0 if ad_config.speculative_config is None else ad_config.speculative_config.max_draft_len
     )
-    max_total_draft_tokens = 0
-    if ad_config.speculative_config is None:
-        max_total_draft_tokens = 0
-    elif hasattr(ad_config.speculative_config, "max_total_draft_tokens"):
-        max_total_draft_tokens = ad_config.speculative_config.max_total_draft_tokens
-    else:
-        max_total_draft_tokens = max_draft_len
+    max_total_draft_tokens = (
+        0
+        if ad_config.speculative_config is None
+        else ad_config.speculative_config.max_total_draft_tokens
+    )
 
     # initialize model engine
     engine = ADEngine.build_from_config(ad_config=ad_config)
@@ -417,6 +415,7 @@ def create_autodeploy_executor(ad_config: LlmArgs):
         max_input_len=ad_config.max_input_len,
         max_batch_size=ad_config.max_batch_size,
         max_draft_len=max_draft_len,
+        max_total_draft_tokens=max_total_draft_tokens,
         max_beam_width=ad_config.max_beam_width,
     )
     return py_executor
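For reference, a minimal sketch of what the simplified resolution above now does, assuming every speculative config exposes `max_total_draft_tokens` (the `hasattr` fallback is gone). `SpecCfg` and `resolve_totals` are illustrative stand-ins, not names from the codebase.

```python
# A minimal sketch of the simplified config resolution, assuming every
# speculative config defines max_total_draft_tokens. SpecCfg and
# resolve_totals are illustrative stand-ins.
from dataclasses import dataclass
from typing import Optional, Tuple

@dataclass
class SpecCfg:
    max_draft_len: int
    max_total_draft_tokens: int

def resolve_totals(spec: Optional[SpecCfg]) -> Tuple[int, int]:
    max_draft_len = 0 if spec is None else spec.max_draft_len
    max_total_draft_tokens = 0 if spec is None else spec.max_total_draft_tokens
    return max_draft_len, max_total_draft_tokens

assert resolve_totals(None) == (0, 0)
assert resolve_totals(SpecCfg(max_draft_len=3, max_total_draft_tokens=7)) == (3, 7)
```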

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 1 addition & 1 deletion
@@ -510,7 +510,7 @@ def __init__(
         aux_stream: Optional[torch.cuda.Stream] = None,
     ):
         config = model_config.pretrained_config
-        predicted_tokens_per_seq = model_config.spec_config.max_draft_len + 1 if model_config.spec_config is not None else 1
+        predicted_tokens_per_seq = model_config.spec_config.max_total_draft_tokens + 1 if model_config.spec_config is not None else 1
         super().__init__(hidden_size=config.hidden_size,
                          num_attention_heads=config.num_attention_heads,
                          num_key_value_heads=config.num_key_value_heads,
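A rough illustration of why `predicted_tokens_per_seq` switches to the total budget: the target model verifies one token per draft node plus the root token in a single pass, so a tree needs more slots than its depth suggests. The numbers below are made up.

```python
# Illustrative numbers only: predicted tokens per sequence for a draft tree.
max_draft_len = 3            # depth of the draft tree
max_total_draft_tokens = 7   # all nodes in the draft tree

old_predicted = max_draft_len + 1            # 4: undercounts a 7-node tree
new_predicted = max_total_draft_tokens + 1   # 8: one slot per node, plus the root
print(old_predicted, new_predicted)
```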

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 6 additions & 9 deletions
@@ -250,10 +250,10 @@ def _get_token_num_for_estimation(self) -> int:
         if not pytorch_backend_config.disable_overlap_scheduler:
             num_extra_tokens_per_seq = num_extra_tokens_per_seq + 1
             if spec_cfg is not None:
-                num_extra_tokens_per_seq += spec_cfg.max_draft_len
+                num_extra_tokens_per_seq += spec_cfg.max_total_draft_tokens
 
         if spec_cfg is not None:
-            num_extra_tokens_per_seq += spec_cfg.max_draft_len
+            num_extra_tokens_per_seq += spec_cfg.max_total_draft_tokens
             num_extra_tokens_per_seq += get_num_extra_kv_tokens(spec_cfg)
 
         if self._dummy_reqs is None:
@@ -808,6 +808,8 @@ def create_py_executor_instance(
         max_beam_width=max_beam_width,
         max_draft_len=spec_config.max_draft_len
        if spec_config is not None else 0,
+        max_total_draft_tokens=spec_config.max_total_draft_tokens
+        if spec_config is not None else 0,
         kv_cache_transceiver=kv_cache_transceiver,
         guided_decoder=guided_decoder,
         start_worker=start_worker,
@@ -824,13 +826,8 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
     max_num_sequences = max_batch_size * mapping.pp_size
     max_draft_len = (0 if speculative_config is None else
                      speculative_config.max_draft_len)
-    max_total_draft_tokens = 0
-    if speculative_config is None:
-        max_total_draft_tokens = 0
-    elif hasattr(speculative_config, 'max_total_draft_tokens'):
-        max_total_draft_tokens = speculative_config.max_total_draft_tokens
-    else:
-        max_total_draft_tokens = max_draft_len
+    max_total_draft_tokens = (0 if speculative_config is None else
+                              speculative_config.max_total_draft_tokens)
 
     return TorchSampler.Args(
         max_seq_len=max_seq_len,
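A sketch of the per-sequence extra-token estimate touched above, assuming the running count starts at one token and folding `get_num_extra_kv_tokens()` into a plain argument. Only the structure mirrors `_get_token_num_for_estimation`; the function name and defaults are illustrative.

```python
# Sketch of the extra-token estimate per sequence, with hypothetical defaults.
def extra_tokens_per_seq(overlap_scheduler: bool,
                         max_total_draft_tokens: int,
                         num_extra_kv_tokens: int = 0) -> int:
    extra = 1                                # the token produced this step
    if overlap_scheduler:
        extra += 1 + max_total_draft_tokens  # one look-ahead step and its drafts
    extra += max_total_draft_tokens          # drafts awaiting verification
    extra += num_extra_kv_tokens
    return extra

print(extra_tokens_per_seq(True, 7))   # 16 for a 7-node draft tree
print(extra_tokens_per_seq(False, 3))  # 4 for a 3-token draft chain
```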

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 5 additions & 2 deletions
@@ -93,7 +93,8 @@ def enable_spec_decode(self):
 
     @property
     def max_possible_draft_len(self):
         engine = self._get_engine()
-        return (engine.original_max_draft_len if self.enable_spec_decode else 0)
+        return (engine.original_max_total_draft_tokens
+                if self.enable_spec_decode else 0)
 
     def get_graph_key(
         self,
@@ -102,10 +103,12 @@ def get_graph_key(
         engine = self._get_engine()
         if engine.is_draft_model and spec_resource_manager is not None and isinstance(
                 spec_resource_manager, Eagle3ResourceManager):
+            # If 'is_first_draft' is True, draft_len will only be 'max_draft_len', not 'max_total_draft_tokens',
+            # even with tree decoding, because we pad the input to 'max_draft_len' for the first draft layer.
             draft_len = engine.original_max_draft_len if spec_resource_manager.is_first_draft else 0
             key = (batch_size, draft_len, spec_resource_manager.is_first_draft)
         else:
-            draft_len = self.spec_config.max_draft_len if self.enable_spec_decode else 0
+            draft_len = self.spec_config.max_total_draft_tokens if self.enable_spec_decode else 0
             key = (batch_size, draft_len, False)
         return key
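A hedged sketch of the graph-key rule above: the draft dimension of the key is the total draft-token budget, except for the first pass of an EAGLE3-style draft model, which is padded to `max_draft_len`. The free function and its toy arguments below are illustrative, not the runner's actual API.

```python
# Illustrative graph-key selection; not the CUDA graph runner's real signature.
def graph_key(batch_size, is_draft_model, is_first_draft,
              max_draft_len, max_total_draft_tokens, spec_decode_enabled=True):
    if is_draft_model:
        draft_len = max_draft_len if is_first_draft else 0
        return (batch_size, draft_len, is_first_draft)
    draft_len = max_total_draft_tokens if spec_decode_enabled else 0
    return (batch_size, draft_len, False)

print(graph_key(8, False, False, 3, 7))  # (8, 7, False): target model, tree budget
print(graph_key(8, True, True, 3, 7))    # (8, 3, True): first draft pass, padded
```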

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 17 additions & 13 deletions
@@ -93,11 +93,11 @@ def warmup(self, resource_manager: ResourceManager) -> None:
 
 def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
                                    max_batch_size: int, max_num_tokens: int,
-                                   max_draft_len: int,
+                                   max_total_draft_tokens: int,
                                    enable_padding: bool) -> list[int]:
     # This is the largest possible batch size for a pure decoding batch.
     max_cuda_graph_bs = min(max_batch_size,
-                            int(max_num_tokens / (1 + max_draft_len)))
+                            int(max_num_tokens / (1 + max_total_draft_tokens)))
 
     result = []
     # This function assumes cuda_graph_batch_sizes is sorted
@@ -162,11 +162,13 @@ def __init__(
             ExpertStatistic.create(self.dist.rank)
         self.pytorch_backend_config = pytorch_backend_config
         self.original_max_draft_len = spec_config.max_draft_len if spec_config is not None else 0
+        self.original_max_total_draft_tokens = spec_config.max_total_draft_tokens if spec_config is not None else 0
 
         # The draft model won't have any draft tokens attached to
         # generation requests when we invoke it autoregressively
         if spec_config is not None and is_draft_model:
             spec_config.max_draft_len = 0
+            spec_config.max_total_draft_tokens = 0
         self.spec_config = spec_config
         self.is_spec_decode = spec_config is not None
         self.sparse_attention_config = sparse_attention_config
@@ -277,7 +279,7 @@ def __init__(
             self.spec_metadata = None
             update_spec_config_from_model_config(self.spec_config,
                                                  self.model.config)
-            max_num_draft_tokens = self.original_max_draft_len * batch_size
+            max_num_draft_tokens = self.original_max_total_draft_tokens * batch_size
             self.draft_tokens_cuda = torch.empty((max_num_draft_tokens, ),
                                                  dtype=torch.int,
                                                  device='cuda')
@@ -297,9 +299,11 @@ def __init__(
             self.without_logits = self.spec_config.spec_dec_mode.without_logits(
             ) or self.model_is_wrapped
             self.max_draft_len = spec_config.max_draft_len
+            self.max_total_draft_tokens = spec_config.max_total_draft_tokens
         else:
             self.without_logits = False
             self.max_draft_len = 0
+            self.max_total_draft_tokens = 0
 
         self.guided_decoder: Optional[CapturableGuidedDecoder] = None
 
@@ -320,7 +324,7 @@ def __init__(
 
         self._cuda_graph_batch_sizes = _filter_cuda_graph_batch_sizes(
             pytorch_backend_config.cuda_graph_batch_sizes, self.batch_size,
-            self.max_num_tokens, self.original_max_draft_len,
+            self.max_num_tokens, self.original_max_total_draft_tokens,
             self._cuda_graph_padding_enabled
         ) if pytorch_backend_config.cuda_graph_batch_sizes else []
 
@@ -364,7 +368,7 @@ def register_forward_pass_callable(self, callable: Callable):
 
     @property
     def runtime_draft_len(self):
-        return self.max_draft_len if self.enable_spec_decode else 0
+        return self.max_total_draft_tokens if self.enable_spec_decode else 0
 
     def set_lora_model_config(self,
                               lora_target_modules: list[str],
@@ -585,20 +589,20 @@ def _capture_generation_cuda_graphs(self,
             if self.model_is_wrapped and self.is_spec_decode and spec_resource_manager is not None and isinstance(
                     spec_resource_manager, Eagle3ResourceManager):
                 # The CDL path uses draft_len > 0 for the number of iterations in the drafting loop.
-                draft_lengths.append(self.original_max_draft_len)
+                draft_lengths.append(self.original_max_total_draft_tokens)
             else:
-                draft_lengths.append(self.max_draft_len)
+                draft_lengths.append(self.max_total_draft_tokens)
         else:
             # For non-draft model, we also capture the CUDA graph instance for draft length 0,
             # so that when we disable spec decode at runtime, we can still run the captured graph.
             # Note that for one engine mode, we are not able to turn off spec decode at runtime.
-            if (self.max_draft_len > 0
+            if (self.max_total_draft_tokens > 0
                     and not self.spec_config.spec_dec_mode.use_one_engine()
                     # Assume that speculation is always on if the user didn't give us a max_concurrency
                    # value. This will save on memory.
                     and self.spec_config.max_concurrency is not None):
                 draft_lengths.append(0)
-            draft_lengths = [self.max_draft_len]
+            draft_lengths = [self.max_total_draft_tokens]
 
         for bs in cuda_graph_batch_sizes:
             if bs > self.batch_size:
@@ -757,7 +761,7 @@ def _create_warmup_request(
                                  num_ctx_requests + num_gen_tokens)),
             token_nums=[1] * num_gen_tokens,
             is_gen=True,
-            max_num_draft_tokens=self.max_draft_len,
+            max_num_draft_tokens=self.max_total_draft_tokens,
             use_mrope=self.use_mrope)
         if spec_resource_manager is not None:
             spec_resource_manager.add_dummy_requests(request_ids=list(
@@ -830,7 +834,7 @@ def _create_cuda_graph_warmup_request(
     def _get_cuda_graph_draft_lengths(
             self, resource_manager: ResourceManager) -> List[int]:
         """Determines the draft lengths for which to capture CUDA graphs."""
-        draft_lengths = [self.max_draft_len]
+        draft_lengths = [self.max_total_draft_tokens]
         spec_resource_manager = resource_manager.get_resource_manager(
             ResourceManagerType.SPEC_RESOURCE_MANAGER)
 
@@ -1027,7 +1031,7 @@ def _preprocess_inputs(self, inputs: Dict[str, Any]):
         """
         if self.enable_spec_decode and not self._disable_overlap_scheduler:
             # When enabling overlap scheduler, the kv cache for draft tokens will
-            # be prepared in advance by using the max_draft_len. But we need to use
+            # be prepared in advance by using the max_total_draft_tokens. But we need to use
             # new_tokens_lens_device to get the real past kv lengths and the
             # correct position ids. And to avoid blocking the async data transfer,
             # we need to preprocess the inputs in forward to update the position_ids and
@@ -2252,7 +2256,7 @@ def forward(
         # attn_metadata now depends on spec_metadata since it determines the shape/content of spec_dec parameter Tensors
         is_spec_dec_mode = spec_metadata.spec_dec_mode.attention_need_spec_dec_mode(
             spec_resource_manager, self.is_draft_model, self.attn_backend,
-            self.model_is_wrapped)
+            self.model_is_wrapped, spec_metadata.is_spec_dec_tree)
         attn_metadata.update_spec_dec_param(
             is_spec_dec_mode, spec_metadata.is_spec_dec_tree,
             spec_metadata.is_spec_dec_dynamic_tree,
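The pure-decode bound used by `_filter_cuda_graph_batch_sizes` is worth spelling out: each decoding request now occupies `1 + max_total_draft_tokens` token slots, so a larger draft tree lowers the biggest CUDA-graph batch size. The values below are illustrative.

```python
# Largest batch size that still fits a pure decoding step in max_num_tokens.
def max_cuda_graph_batch_size(max_batch_size: int, max_num_tokens: int,
                              max_total_draft_tokens: int) -> int:
    return min(max_batch_size, max_num_tokens // (1 + max_total_draft_tokens))

print(max_cuda_graph_batch_size(256, 8192, 0))   # 256: no speculation
print(max_cuda_graph_batch_size(256, 8192, 3))   # 256: a 3-token chain still fits
print(max_cuda_graph_batch_size(256, 8192, 63))  # 128: a 63-node tree halves it
```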

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 10 additions & 8 deletions
@@ -160,6 +160,7 @@ def __init__(self,
                  max_batch_size: int = 8,
                  max_beam_width: int = 1,
                  max_draft_len: int = 0,
+                 max_total_draft_tokens: int = 0,
                  kv_cache_transceiver: Optional[KvCacheTransceiver] = None,
                  guided_decoder: Optional[GuidedDecoder] = None,
                  garbage_collection_gen0_threshold: Optional[int] = None,
@@ -195,6 +196,7 @@ def __init__(self,
         self.active = True
         self.max_beam_width = max_beam_width
         self.max_draft_len = max_draft_len
+        self.max_total_draft_tokens = max_total_draft_tokens
         self.max_num_tokens = model_engine.pytorch_backend_config.max_num_tokens
         self.print_log = model_engine.pytorch_backend_config.print_iter_log
         self.enable_iter_perf_stats = model_engine.pytorch_backend_config.enable_iter_perf_stats
@@ -1040,7 +1042,7 @@ def _prepare_and_schedule_batch(self):
             self.use_spec_decode = self.drafter.should_use_spec_decode(
                 self.active_requests, self.max_batch_size,
                 self.model_engine.max_num_tokens,
-                self.model_engine.spec_config.max_draft_len)
+                self.model_engine.spec_config.max_total_draft_tokens)
             logger.debug(f"Use spec decode: {self.use_spec_decode}")
             self.model_engine.enable_spec_decode = self.use_spec_decode
 
@@ -1050,10 +1052,10 @@ def _prepare_and_schedule_batch(self):
                         LlmRequestState.GENERATION_IN_PROGRESS,
                         LlmRequestState.DISAGG_GENERATION_INIT):
                     continue
-                max_draft_len = self.model_engine.spec_config.max_draft_len
+                max_total_draft_tokens = self.model_engine.spec_config.max_total_draft_tokens
                 request.draft_tokens = [
                     0
-                ] * max_draft_len if max_draft_len > 0 else []
+                ] * max_total_draft_tokens if max_total_draft_tokens > 0 else []
 
             # When overlap scheduler is enabled, and we already prepared the draft tokens in the previous batch,
             # we don't need to initialize py_draft_tokens at this stage because we haven't append the accepted tokens to the request yet.
@@ -1223,11 +1225,11 @@ def _prepare_draft_requests(self):
                 continue
 
             req.py_last_draft_tokens = req.py_draft_tokens
-            max_draft_len = self.model_engine.spec_config.max_draft_len
+            max_total_draft_tokens = self.model_engine.spec_config.max_total_draft_tokens
 
-            if max_draft_len > 0 and self.use_spec_decode:
-                req.py_draft_tokens = [0] * max_draft_len
-                req.py_draft_pages_allocated = max_draft_len
+            if max_total_draft_tokens > 0 and self.use_spec_decode:
+                req.py_draft_tokens = [0] * max_total_draft_tokens
+                req.py_draft_pages_allocated = max_total_draft_tokens
             else:
                 req.py_draft_tokens = []
                 req.py_draft_pages_allocated = 0
@@ -1615,7 +1617,7 @@ def _pad_attention_dp_dummy_request(self):
                 request_ids=[0],
                 is_gen=True,
                 prepare_resource=True,
-                max_num_draft_tokens=self.max_draft_len,
+                max_num_draft_tokens=self.max_total_draft_tokens,
             )[0]
             llm_request.is_attention_dp_dummy = True
             spec_resource_manager = self.resource_manager.get_resource_manager(
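A minimal sketch of the placeholder allocation in `_prepare_draft_requests`: each request reserves one token slot and one page-budget entry per possible draft token, which is now the tree-wide total. `Request` below is a stand-in for the real `LlmRequest`.

```python
# Placeholder allocation per request; Request is a hypothetical stand-in.
from dataclasses import dataclass, field
from typing import List

@dataclass
class Request:
    py_draft_tokens: List[int] = field(default_factory=list)
    py_draft_pages_allocated: int = 0

def prepare_draft_request(req: Request, use_spec_decode: bool,
                          max_total_draft_tokens: int) -> None:
    if use_spec_decode and max_total_draft_tokens > 0:
        req.py_draft_tokens = [0] * max_total_draft_tokens
        req.py_draft_pages_allocated = max_total_draft_tokens
    else:
        req.py_draft_tokens = []
        req.py_draft_pages_allocated = 0

req = Request()
prepare_draft_request(req, use_spec_decode=True, max_total_draft_tokens=7)
print(len(req.py_draft_tokens), req.py_draft_pages_allocated)  # 7 7
```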

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 7 additions & 4 deletions
@@ -357,7 +357,9 @@ def drafting_loop_wrapper(model):
             from tensorrt_llm._torch.speculative.drafting_loops import \
                 ChainDrafter
 
-            return ChainDrafter(spec_config.max_draft_len, model)
+            return ChainDrafter(spec_config.max_draft_len,
+                                spec_config.max_total_draft_tokens,
+                                model)
     else:
         drafting_loop_wrapper = None
 
@@ -397,11 +399,11 @@ def drafting_loop_wrapper(model):
     if not pytorch_backend_config.disable_overlap_scheduler:
         model_engine_max_seq_len = model_engine.max_seq_len + 1
         if spec_config is not None:
-            model_engine_max_seq_len += spec_config.max_draft_len
+            model_engine_max_seq_len += spec_config.max_total_draft_tokens
 
     if spec_config is not None:
         model_engine_max_seq_len += get_num_extra_kv_tokens(spec_config)
-        model_engine_max_seq_len += spec_config.max_draft_len
+        model_engine_max_seq_len += spec_config.max_total_draft_tokens
 
     max_seq_len = model_engine_max_seq_len
     max_num_tokens = model_engine.max_num_tokens
@@ -471,7 +473,8 @@ def drafting_loop_wrapper(model):
             "vocab_size_padded": model_engine.model.vocab_size_padded
         }
         if spec_config is not None:
-            kwargs["max_num_draft_tokens"] = spec_config.max_draft_len
+            kwargs[
+                "max_num_draft_tokens"] = spec_config.max_total_draft_tokens
 
         if spec_config is None or spec_config.spec_dec_mode.support_guided_decoder(
         ):
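A rough sketch of the sequence-length budget assembled above: the overlap scheduler adds one look-ahead step plus its draft tokens, and speculation adds the draft-token budget again for verification plus any extra KV tokens. `get_num_extra_kv_tokens()` is folded into a plain argument here, and the function name is illustrative.

```python
# Sequence-length budget sketch; executor_max_seq_len is a hypothetical helper.
def executor_max_seq_len(model_max_seq_len: int, overlap_scheduler: bool,
                         max_total_draft_tokens: int,
                         num_extra_kv_tokens: int = 0) -> int:
    max_seq_len = model_max_seq_len
    if overlap_scheduler:
        max_seq_len += 1 + max_total_draft_tokens
    max_seq_len += num_extra_kv_tokens + max_total_draft_tokens
    return max_seq_len

print(executor_max_seq_len(4096, True, 7))   # 4111 with a 7-node draft tree
print(executor_max_seq_len(4096, False, 0))  # 4096 without speculation or overlap
```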

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 3 additions & 3 deletions
@@ -589,7 +589,7 @@ class Args:
 
     def __init__(self, args: Args):
         self.max_seq_len = args.max_seq_len
-        self.max_tokens = args.max_draft_len + 1
+        self.max_tokens = args.max_total_draft_tokens + 1
         assert args.max_beam_width == self.MAX_BEAM_WIDTH, (
             "TorchSampler only supports beam_width = 1"
         )
@@ -738,9 +738,9 @@ def _process_draft_tokens_tree(
         we can find the longest match by comparing all the paths.
         Args:
             request: LlmRequest. The request with draft tokens.
-            new_tokens: torch.Tensor. [max_draft_len + 1, max_num_sequences, MAX_BEAM_WIDTH], host buffer.
+            new_tokens: torch.Tensor. [max_total_draft_tokens + 1, max_num_sequences, MAX_BEAM_WIDTH], host buffer.
                 The tokens generated by the target model
-                The relationship between [max_draft_len + 1] and the draft token tree:
+                The relationship between [max_total_draft_tokens + 1] and the draft token tree:
                 If the current node is accepted, what is the NEXT token_id that the target model will generate?
                 For example, new_tokens[0, req_idx, 1] indicates the NEXT token_id sampled from the root
                 node in the draft token tree if it is accepted.
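A hedged sketch of the tree acceptance described in the docstring above: with one verified continuation per draft node (plus the root), acceptance walks every root-to-leaf path and keeps the longest run of draft tokens that the target model reproduced. The flat-list layout below is a simplification of the real `[max_total_draft_tokens + 1, max_num_sequences, MAX_BEAM_WIDTH]` buffer, and the function is illustrative.

```python
# Simplified path-matching over a draft token tree; not TorchSampler's API.
def longest_accepted_path(paths, draft_tokens, target_next_tokens):
    """paths: root-to-leaf node-index paths; draft_tokens[i] is the token
    proposed at node i; target_next_tokens[i] is the target model's next
    token if node i is accepted (index 0 is the root)."""
    best = []
    for path in paths:
        accepted, prev = [], 0
        for node in path:
            if draft_tokens[node] != target_next_tokens[prev]:
                break
            accepted.append(draft_tokens[node])
            prev = node
        if len(accepted) > len(best):
            best = accepted
    return best

#                root  n1  n2  n3  n4
draft_tokens  = [None, 11, 12, 13, 14]   # tokens proposed at each tree node
target_next   = [11,   13, 99, 7,  5]    # target's next token after each node
paths         = [[1, 2], [1, 3], [4]]    # root-to-leaf node-index paths
print(longest_accepted_path(paths, draft_tokens, target_next))  # [11, 13]
```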
