Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions examples/text-generation/run_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,8 @@ def main():
def generate(size=None, reduce_recompile=False):
"""Generates sequences from the input sentences and returns them."""

t0 = time.perf_counter()
print(f"Step4+ starting time is {t0*1000}", flush=True)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a need to print the starting time?

# Tokenization
if args.max_input_tokens > 0:
input_tokens = tokenizer.batch_encode_plus(
Expand All @@ -314,15 +316,18 @@ def generate(size=None, reduce_recompile=False):
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(args.device)

outputs = model.generate(
output_tokens = model.generate(
**input_tokens,
generation_config=generation_config,
lazy_mode=use_lazy_mode,
hpu_graphs=args.use_hpu_graphs,
profiling_steps=args.profiling_steps,
profiling_warmup_steps=args.profiling_warmup_steps,
).cpu()
return tokenizer.batch_decode(outputs, skip_special_tokens=True)
outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
duration = time.perf_counter() - t0
print(f"Total E2E time of this iteration is {duration:.3f}s", flush=True)
return outputs

from optimum.habana.utils import HabanaProfile

Expand Down
19 changes: 18 additions & 1 deletion optimum/habana/transformers/generation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import copy
import inspect
import math
import time
import warnings
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

Expand Down Expand Up @@ -1385,12 +1386,13 @@ def greedy_search(
reduce_recompile = model_kwargs["reduce_recompile"]

prompt_len = input_ids.shape[-1]

if not bucket_internal:
if bucket_size >= 0:
inc = iter(incrementor(bucket_size, prompt_len))
if bucket_size > 0:
assert "position_ids" not in model_kwargs, "Untested path"

greedy_first = True
while True:
if lazy_mode:
self.htcore_generation.mark_step()
Expand Down Expand Up @@ -1512,6 +1514,13 @@ def greedy_search(

hb_profer.step()

if greedy_first:
import habana_frameworks.torch.hpu as torch_hpu

torch_hpu.synchronize()
print(f"First Token time(greedy):{time.perf_counter()*1000}")
greedy_first = False

if this_peer_finished and not synced_gpus:
break

Expand Down Expand Up @@ -1730,6 +1739,7 @@ def sample(
hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps)
hb_profer.start()
this_peer_finished = False # used by synced_gpus only
sample_first = True
# auto-regressive generation
while True:
if lazy_mode:
Expand Down Expand Up @@ -1830,6 +1840,13 @@ def sample(

hb_profer.step()

if sample_first:
import habana_frameworks.torch.hpu as torch_hpu

torch_hpu.synchronize()
print(f"First Token time(sample):{time.perf_counter()*1000}")
sample_first = False

if this_peer_finished and not synced_gpus:
break

Expand Down