Skip to content

Commit 6348945

Browse files
committed
fixup! [None][feat] Enable early exit with overlap scheduler
Signed-off-by: Robin Kobus <[email protected]>
1 parent 135648f commit 6348945

File tree

1 file changed

+10
-11
lines changed

1 file changed

+10
-11
lines changed

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1400,18 +1400,17 @@ def _executor_loop_overlap(self):
1400 1400
self.guided_decoder.add_batch(scheduled_batch)
1401 1401
self.guided_decoder.init_disagg_gen_requests()
1402 1402

1403-
previous_tensors = self.previous_batch and self.previous_batch.sample_state
1404-
# If there are previous draft tokens, we need to update the target requests to accept some draft tokens.
1405-
# When there's any accepted tokens, we can't directly use the previous batch's outputs in this iteration for the target model,
1406-
# so we'll set the target model's input to None and skip updating the target requests after target model forward.
1407-
use_previous_draft_tokens = self.has_previous_draft_tokens
1408-
if self.drafter is not None and (self.use_spec_decode
1409-
or use_previous_draft_tokens):
1410-
target_inputs = self._handle_speculative_decoding(
1411-
scheduled_batch, previous_tensors,
1412-
previous_tensors_device)
1403+
previous_tensors = self.previous_batch and self.previous_batch.sample_state
1404+
# If there are previous draft tokens, we need to update the target requests to accept some draft tokens.
1405+
# When there's any accepted tokens, we can't directly use the previous batch's outputs in this iteration for the target model,
1406+
# so we'll set the target model's input to None and skip updating the target requests after target model forward.
1407+
use_previous_draft_tokens = self.has_previous_draft_tokens
1408+
if self.drafter is not None and (self.use_spec_decode or
1409+
use_previous_draft_tokens):
1410+
target_inputs = self._handle_speculative_decoding(
1411+
scheduled_batch, previous_tensors,
1412+
previous_tensors_device)
1413 1413

1414-
if can_queue:
1415 1414
# Use the draft_model's outputs if we've launched the draft model.
1416 1415
# Otherwise, use the previous batch's outputs.
1417 1416
if (target_inputs is not None

0 commit comments

Comments (0)