@@ -1400,18 +1400,17 @@ def _executor_loop_overlap(self):
14001400 self .guided_decoder .add_batch (scheduled_batch )
14011401 self .guided_decoder .init_disagg_gen_requests ()
14021402
1403- previous_tensors = self .previous_batch and self .previous_batch .sample_state
1404- # If there are previous draft tokens, we need to update the target requests to accept some draft tokens.
1405- # When there's any accepted tokens, we can't directly use the previous batch's outputs in this iteration for the target model,
1406- # so we'll set the target model's input to None and skip updating the target requests after target model forward.
1407- use_previous_draft_tokens = self .has_previous_draft_tokens
1408- if self .drafter is not None and (self .use_spec_decode
1409- or use_previous_draft_tokens ):
1410- target_inputs = self ._handle_speculative_decoding (
1411- scheduled_batch , previous_tensors ,
1412- previous_tensors_device )
1403+ previous_tensors = self .previous_batch and self .previous_batch .sample_state
1404+ # If there are previous draft tokens, we need to update the target requests to accept some draft tokens.
1405+ # When there's any accepted tokens, we can't directly use the previous batch's outputs in this iteration for the target model,
1406+ # so we'll set the target model's input to None and skip updating the target requests after target model forward.
1407+ use_previous_draft_tokens = self .has_previous_draft_tokens
1408+ if self .drafter is not None and (self .use_spec_decode or
1409+ use_previous_draft_tokens ):
1410+ target_inputs = self ._handle_speculative_decoding (
1411+ scheduled_batch , previous_tensors ,
1412+ previous_tensors_device )
14131413
1414- if can_queue :
14151414 # Use the draft_model's outputs if we've launched the draft model.
14161415 # Otherwise, use the previous batch's outputs.
14171416 if (target_inputs is not None
0 commit comments