HabanaAI · shakkedw · Feb 8, 2024 · Feb 1, 2024 · Feb 4, 2024 · Feb 5, 2024
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
@@ -294,6 +294,8 @@ def main():
         def generate(size=None, reduce_recompile=False):
             """Generates sequences from the input sentences and returns them."""
 
+            t0 = time.perf_counter()
+            print(f"Step4+ starting time is {t0*1000}", flush=True)
             # Tokenization
             if args.max_input_tokens > 0:
                 input_tokens = tokenizer.batch_encode_plus(
@@ -314,15 +316,18 @@ def generate(size=None, reduce_recompile=False):
                     if torch.is_tensor(input_tokens[t]):
                         input_tokens[t] = input_tokens[t].to(args.device)
 
-            outputs = model.generate(
+            output_tokens = model.generate(
                 **input_tokens,
                 generation_config=generation_config,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
                 profiling_steps=args.profiling_steps,
                 profiling_warmup_steps=args.profiling_warmup_steps,
             ).cpu()
-            return tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
+            duration = time.perf_counter() - t0
+            print(f"Total E2E time of this iteration is {duration:.3f}s", flush=True)
+            return outputs
 
         from optimum.habana.utils import HabanaProfile
 

@@ -17,6 +17,7 @@
 import copy
 import inspect
 import math
+import time
 import warnings
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -1385,12 +1386,13 @@ def greedy_search(
         reduce_recompile = model_kwargs["reduce_recompile"]
 
         prompt_len = input_ids.shape[-1]
+
         if not bucket_internal:
             if bucket_size >= 0:
                 inc = iter(incrementor(bucket_size, prompt_len))
             if bucket_size > 0:
                 assert "position_ids" not in model_kwargs, "Untested path"
-
+        greedy_first = True
         while True:
             if lazy_mode:
                 self.htcore_generation.mark_step()
@@ -1512,6 +1514,13 @@ def greedy_search(
 
             hb_profer.step()
 
+            if greedy_first:
+                import habana_frameworks.torch.hpu as torch_hpu
+
+                torch_hpu.synchronize()
+                print(f"First Token time(greedy):{time.perf_counter()*1000}")
+                greedy_first = False
+
             if this_peer_finished and not synced_gpus:
                 break
 
@@ -1730,6 +1739,7 @@ def sample(
         hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps)
         hb_profer.start()
         this_peer_finished = False  # used by synced_gpus only
+        sample_first = True
         # auto-regressive generation
         while True:
             if lazy_mode:
@@ -1830,6 +1840,13 @@ def sample(
 
             hb_profer.step()
 
+            if sample_first:
+                import habana_frameworks.torch.hpu as torch_hpu
+
+                torch_hpu.synchronize()
+                print(f"First Token time(sample):{time.perf_counter()*1000}")
+                sample_first = False
+
             if this_peer_finished and not synced_gpus:
                 break