From b322688bda76c229a1b95f4c01727b8b2f9a0eaf Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Sun, 23 Jun 2024 06:01:15 +0000 Subject: [PATCH] Move the 1st token finish time to not include 2nd step kv pad time. --- .../habana/transformers/generation/utils.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index c858ef1e27..987a5bbba8 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -1863,6 +1863,14 @@ def _greedy_search( input_ids, scores, token_idx=cur_len, ignore_eos=ignore_eos, eos_token_id=eos_token_id ) this_peer_finished = unfinished_sequences.max() == 0 + hb_profer.step() + if hb_gen_time is not None: + if not time_to_first_token_done: + time_to_first_token_done = True + import habana_frameworks.torch.hpu as torch_hpu + + torch_hpu.synchronize() + hb_gen_time.step() if ( not model_kwargs.get("pad_done", False) @@ -1873,14 +1881,6 @@ def _greedy_search( # before starting the decode phase. self._pad_past_key_values(model_kwargs) model_kwargs["pad_done"] = True - hb_profer.step() - if hb_gen_time is not None: - if not time_to_first_token_done: - time_to_first_token_done = True - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - hb_gen_time.step() if ( model_kwargs.get("use_hpu_graphs", False) @@ -2282,6 +2282,14 @@ def _sample( input_ids, scores, token_idx=cur_len, ignore_eos=ignore_eos, eos_token_id=eos_token_id ) this_peer_finished = unfinished_sequences.max() == 0 + hb_profer.step() + if hb_gen_time is not None: + if not time_to_first_token_done: + time_to_first_token_done = True + import habana_frameworks.torch.hpu as torch_hpu + + torch_hpu.synchronize() + hb_gen_time.step() if ( not model_kwargs.get("pad_done", False) @@ -2293,15 +2301,6 @@ def _sample( self._pad_past_key_values(model_kwargs) model_kwargs["pad_done"] = True - hb_profer.step() - if hb_gen_time is not None: - if not time_to_first_token_done: - time_to_first_token_done = True - import habana_frameworks.torch.hpu as torch_hpu - - torch_hpu.synchronize() - hb_gen_time.step() - if ( model_kwargs.get("use_hpu_graphs", False) and model_kwargs.get("limit_hpu_graphs", False)