fixup! [None][feat] Enable early exit with overlap scheduler

Funatiq · Funatiq · commit 6348945f234d · 2025-11-16T13:36:50.000Z
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -1400,18 +1400,17 @@ def _executor_loop_overlap(self):
                         self.guided_decoder.add_batch(scheduled_batch)
                         self.guided_decoder.init_disagg_gen_requests()
 
-                previous_tensors = self.previous_batch and self.previous_batch.sample_state
-                # If there are previous draft tokens, we need to update the target requests to accept some draft tokens.
-                # When there's any accepted tokens, we can't directly use the previous batch's outputs in this iteration for the target model,
-                # so we'll set the target model's input to None and skip updating the target requests after target model forward.
-                use_previous_draft_tokens = self.has_previous_draft_tokens
-                if self.drafter is not None and (self.use_spec_decode
-                                                 or use_previous_draft_tokens):
-                    target_inputs = self._handle_speculative_decoding(
-                        scheduled_batch, previous_tensors,
-                        previous_tensors_device)
+                    previous_tensors = self.previous_batch and self.previous_batch.sample_state
+                    # If there are previous draft tokens, we need to update the target requests to accept some draft tokens.
+                    # When there's any accepted tokens, we can't directly use the previous batch's outputs in this iteration for the target model,
+                    # so we'll set the target model's input to None and skip updating the target requests after target model forward.
+                    use_previous_draft_tokens = self.has_previous_draft_tokens
+                    if self.drafter is not None and (self.use_spec_decode or
+                                                     use_previous_draft_tokens):
+                        target_inputs = self._handle_speculative_decoding(
+                            scheduled_batch, previous_tensors,
+                            previous_tensors_device)
 
-                if can_queue:
                     # Use the draft_model's outputs if we've launched the draft model.
                     # Otherwise, use the previous batch's outputs.
                     if (target_inputs is not None