From b322688bda76c229a1b95f4c01727b8b2f9a0eaf Mon Sep 17 00:00:00 2001
From: Libin Tang <litang@habana.ai>
Date: Sun, 23 Jun 2024 06:01:15 +0000
Subject: [PATCH] Move the first-token finish time so that it excludes the
 second-step KV-cache padding time.

---
 .../habana/transformers/generation/utils.py   | 33 +++++++++----------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py
index c858ef1e27..987a5bbba8 100755
--- a/optimum/habana/transformers/generation/utils.py
+++ b/optimum/habana/transformers/generation/utils.py
@@ -1863,6 +1863,14 @@ def _greedy_search(
                     input_ids, scores, token_idx=cur_len, ignore_eos=ignore_eos, eos_token_id=eos_token_id
                 )
                 this_peer_finished = unfinished_sequences.max() == 0
+            hb_profer.step()
+            if hb_gen_time is not None:
+                if not time_to_first_token_done:
+                    time_to_first_token_done = True
+                    import habana_frameworks.torch.hpu as torch_hpu
+
+                    torch_hpu.synchronize()
+                hb_gen_time.step()
 
             if (
                 not model_kwargs.get("pad_done", False)
@@ -1873,14 +1881,6 @@ def _greedy_search(
                 # before starting the decode phase.
                 self._pad_past_key_values(model_kwargs)
                 model_kwargs["pad_done"] = True
-            hb_profer.step()
-            if hb_gen_time is not None:
-                if not time_to_first_token_done:
-                    time_to_first_token_done = True
-                    import habana_frameworks.torch.hpu as torch_hpu
-
-                    torch_hpu.synchronize()
-                hb_gen_time.step()
 
         if (
             model_kwargs.get("use_hpu_graphs", False)
@@ -2282,6 +2282,14 @@ def _sample(
                     input_ids, scores, token_idx=cur_len, ignore_eos=ignore_eos, eos_token_id=eos_token_id
                 )
                 this_peer_finished = unfinished_sequences.max() == 0
+            hb_profer.step()
+            if hb_gen_time is not None:
+                if not time_to_first_token_done:
+                    time_to_first_token_done = True
+                    import habana_frameworks.torch.hpu as torch_hpu
+
+                    torch_hpu.synchronize()
+                hb_gen_time.step()
 
             if (
                 not model_kwargs.get("pad_done", False)
@@ -2293,15 +2301,6 @@ def _sample(
                 self._pad_past_key_values(model_kwargs)
                 model_kwargs["pad_done"] = True
 
-            hb_profer.step()
-            if hb_gen_time is not None:
-                if not time_to_first_token_done:
-                    time_to_first_token_done = True
-                    import habana_frameworks.torch.hpu as torch_hpu
-
-                    torch_hpu.synchronize()
-                hb_gen_time.step()
-
         if (
             model_kwargs.get("use_hpu_graphs", False)
             and model_kwargs.get("limit_hpu_graphs", False)