 from .kv_cache_connector import KvCacheConnectorManager
 from .kv_cache_transceiver import KvCacheTransceiver
 from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState,
-                          LlmResponse, get_draft_token_length)
+                          LlmResponse)
 from .model_engine import ModelEngine
 from .sampler import Sampler, SampleState, SampleStateTensors
 from .scheduler import RequestScheduler, ScheduledRequests
@@ -1067,14 +1067,9 @@ def _executor_loop(self):
                                      scheduled_requests=scheduled_batch):
                     self.drafter.prepare_draft_tokens(
                         scheduled_batch, self.resource_manager)
-                    # Pad draft tokens to the max draft length. This is for CUDA
-                    # graph compatibility.
-                    for req in scheduled_batch.generation_requests:
-                        max_draft_tokens = self.max_draft_len
-                        num_draft_tokens = get_draft_token_length(req)
-                        req.py_draft_tokens.extend(
-                            0 for _ in range(max_draft_tokens -
-                                             num_draft_tokens))
+                    # Pad draft tokens to the max draft length. This is for CUDA graph compatibility.
+                    self.drafter.pad_draft_tokens_for_cuda_graph(
+                        scheduled_batch)
                 # add_batch must be called again to restore to target requests with updated draft tokens.
                 if self.guided_decoder is not None:
                     self.guided_decoder.add_batch(scheduled_batch)
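
The eight deleted lines above are the whole contract of the new `Drafter.pad_draft_tokens_for_cuda_graph` call. Below is a minimal runnable sketch reconstructed from that loop; the `_Request` stub, the free-function signature, and `len()` standing in for the removed `get_draft_token_length` helper are illustrative assumptions, not the library's API:

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class _Request:
    # Stand-in for LlmRequest; only the field the padding loop touches.
    py_draft_tokens: List[int] = field(default_factory=list)


def pad_draft_tokens_for_cuda_graph(generation_requests: List[_Request],
                                    max_draft_len: int) -> None:
    """Zero-pad every request's draft tokens up to max_draft_len.

    CUDA graphs replay with fixed tensor shapes, so each request must expose
    exactly max_draft_len draft tokens no matter how many the drafter produced.
    """
    for req in generation_requests:
        num_draft_tokens = len(req.py_draft_tokens)
        req.py_draft_tokens.extend(
            0 for _ in range(max_draft_len - num_draft_tokens))


# Requests with uneven draft counts all end up at the fixed length.
reqs = [_Request([101, 102]), _Request()]
pad_draft_tokens_for_cuda_graph(reqs, max_draft_len=4)
assert [len(r.py_draft_tokens) for r in reqs] == [4, 4]
```

Padding to a fixed length matters because a batch whose per-request draft-token counts vary would otherwise force a fresh graph capture on every shape change.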
@@ -1207,6 +1202,8 @@ def _executor_loop_overlap(self):
                 target_inputs, draft_outputs, draft_batch = self._handle_speculative_decoding(
                     scheduled_batch, previous_tensors)
 
+            # Use the draft model's outputs if we've launched the draft model.
+            # Otherwise, use the previous batch's outputs.
             if target_inputs is not None:
                 previous_tensors_device = target_inputs
             else:
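
The new comment documents the selection that follows. A one-function sketch of that decision; `pick_device_tensors` is a hypothetical helper, and the fallback value is an assumption since the else-branch lies outside this hunk:

```python
# Illustrative only: the if-branch is visible in the diff; the fallback is
# inferred from the comment ("use the previous batch's outputs").
def pick_device_tensors(target_inputs, previous_tensors):
    # Prefer the tensors produced by the just-launched draft model; otherwise
    # reuse whatever the previous target iteration produced.
    return target_inputs if target_inputs is not None else previous_tensors
```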
@@ -1971,17 +1968,10 @@ def _remove_inflight_ids(self, scheduled_requests):
             self.inflight_req_ids.erase(req.request_id)
 
     def _handle_speculative_decoding(self, scheduled_batch, previous_tensors):
-        """
-        Handle speculative decoding logic.
-
-        Args:
-            scheduled_batch: The scheduled batch to process
-            previous_tensors: Previous iteration tensors
-
-        Returns:
-            Tuple of (target_inputs, draft_outputs, draft_batch)
-        """
         with request_context(is_draft=True, scheduled_requests=scheduled_batch):
+            # Check early whether we need to forward the draft model. If so,
+            # the overlap happens between the target and the draft requests;
+            # otherwise we can still overlap the previous and current target requests.
             has_draft_batch = (
                 self.previous_batch is not None
                 and self.drafter.should_forward_draft_model(scheduled_batch))
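
The three new comment lines describe the gate visible at the end of this hunk. A self-contained sketch of that gate; only the `previous_batch is not None and drafter.should_forward_draft_model(...)` conjunction comes from the diff, so the predicate body below is a hypothetical stand-in:

```python
class _SketchDrafter:
    def should_forward_draft_model(self, scheduled_batch) -> bool:
        # Hypothetical predicate: run the draft model only when the batch has
        # generation requests that can accept speculative tokens.
        return bool(getattr(scheduled_batch, "generation_requests", []))


def needs_draft_forward(drafter, previous_batch, scheduled_batch) -> bool:
    # If True, the draft-model forward overlaps with the target requests;
    # if False, no draft batch is launched this iteration and the executor
    # still overlaps the previous target batch with the current one.
    return (previous_batch is not None
            and drafter.should_forward_draft_model(scheduled_batch))
```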
@@ -2006,60 +1996,29 @@ def _handle_speculative_decoding(self, scheduled_batch, previous_tensors):
     def _process_draft_results(self, scheduled_batch, draft_outputs,
                                draft_batch):
         """
-        Process the results from draft model execution.
-
-        Args:
-            scheduled_batch: The scheduled batch
-            draft_outputs: The outputs from the draft model
-            draft_batch: The draft batch that was processed
+        Append the draft tokens to the target requests, and clean up the draft resources.
         """
         req_id_to_old_request = {
             req.py_request_id: req
             for req in scheduled_batch.all_requests()
         }
 
         if self.drafter.use_static_draft_loop:
-            self.process_static_draft_outputs(draft_outputs, draft_batch,
-                                              req_id_to_old_request)
+            self.drafter.process_static_draft_outputs(draft_outputs,
+                                                      draft_batch,
+                                                      req_id_to_old_request)
         elif draft_outputs is not None:
-            self._process_dynamic_draft_outputs(scheduled_batch, draft_outputs,
-                                                req_id_to_old_request)
+            self.drafter.process_dynamic_draft_outputs(draft_outputs,
+                                                       req_id_to_old_request)
 
-    def process_static_draft_outputs(self, draft_outputs, draft_batch,
-                                     req_id_to_old_request):
-        """
-        Process outputs from static draft loop.
-
-        Args:
-            draft_outputs: The outputs from the draft model
-            draft_batch: The draft batch that was processed
-            req_id_to_old_request: Mapping from request ID to original request
-        """
-        self.drafter.process_static_draft_outputs(draft_outputs, draft_batch,
-                                                  req_id_to_old_request)
-
-    def _process_dynamic_draft_outputs(self, scheduled_batch, draft_outputs,
-                                       req_id_to_old_request):
-        """
-        Process outputs from dynamic draft loop.
-
-        Args:
-            scheduled_batch: The scheduled batch
-            draft_outputs: The outputs from the draft model
-            req_id_to_old_request: Mapping from request ID to original request
-        """
-        self.drafter.update_requests(draft_outputs)
-        self.drafter.process_decoded_tokens(draft_outputs.scheduled_requests,
-                                            req_id_to_old_request)
-
-        # Rollback draft tokens if guided decoder is available
+        # Pad draft tokens to the max draft length. This is for CUDA graph compatibility.
+        self.drafter.pad_draft_tokens_for_cuda_graph(scheduled_batch)
+        # add_batch must be called again to restore to target requests with updated draft tokens.
         if self.guided_decoder is not None:
             self.guided_decoder.add_batch(scheduled_batch)
             if hasattr(self.drafter, "guided_decoder"):
                 self.guided_decoder.rollback_draft_tokens()
 
-        self.drafter.pad_draft_tokens_for_cuda_graph(scheduled_batch)
-
 
 class DisaggPPTerminationHandler:
     """Handles termination synchronization across pipeline parallel ranks under disaggregated serving.
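
For reference, the two `_process_*_draft_outputs` wrappers deleted above moved onto the `Drafter`. A sketch of how the relocated dynamic path plausibly looks, reconstructed from the two calls in the deleted body; the stub request types and the token-copying detail inside `process_decoded_tokens` are assumptions:

```python
from typing import Dict, List


class _DraftRequest:
    def __init__(self, py_request_id: int, tokens: List[int]):
        self.py_request_id = py_request_id
        self.py_draft_tokens = tokens


class _TargetRequest:
    def __init__(self, py_request_id: int):
        self.py_request_id = py_request_id
        self.py_draft_tokens: List[int] = []


class _SketchDrafter:
    def update_requests(self, draft_outputs) -> None:
        """Finalize sampling state for the draft batch (stubbed out here)."""

    def process_decoded_tokens(self, draft_requests: List[_DraftRequest],
                               req_id_to_old_request: Dict[int, _TargetRequest]) -> None:
        # Assumption: copy each draft request's decoded tokens back onto the
        # target request it was spawned from.
        for draft_req in draft_requests:
            target = req_id_to_old_request[draft_req.py_request_id]
            target.py_draft_tokens = list(draft_req.py_draft_tokens)

    def process_dynamic_draft_outputs(self, draft_outputs,
                                      req_id_to_old_request) -> None:
        # Mirrors the deleted executor helper: finalize the draft batch, then
        # propagate decoded draft tokens to the original target requests.
        self.update_requests(draft_outputs)
        self.process_decoded_tokens(draft_outputs.scheduled_requests,
                                    req_id_to_old_request)
```

Note that the guided-decoder rollback and the CUDA-graph padding stayed in the executor's `_process_draft_results` rather than moving into the `Drafter`, presumably because they touch executor-owned state such as `self.guided_decoder`.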