Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ style: clean
# Run unit and integration tests
fast_tests:
python -m pip install .[tests]
python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py
python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py tests/test_habana_profiler_unit.py
Comment thread
regisss marked this conversation as resolved.
# TODO enable when CI has more servers
# python -m pytest test_functional_text_generation_example.py

Expand Down Expand Up @@ -89,11 +89,15 @@ slow_tests_1x: test_installs
python -m pip install peft==0.10.0; \
python -m pytest tests/test_peft_inference.py || status2=$$?; \
python -m pytest tests/test_pipeline.py || status3=$$?; \
exit $$((status1 + status2 + status3))
python -m pytest tests/test_habana_profiler_integration.py -v -s -m "not x8" || status4=$$?; \
exit $$((status1 + status2 + status3 + status4))

# Run multi-card non-regression tests
slow_tests_8x: test_installs
DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card"
@status1=0; status2=0; \
DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card" || status1=$$?; \
python -m pytest tests/test_habana_profiler_integration.py -v -s -m x8 || status2=$$?; \
exit $$((status1 + status2))

# Run DeepSpeed non-regression tests
slow_tests_deepspeed: test_installs
Expand Down
22 changes: 17 additions & 5 deletions examples/stable-diffusion/training/train_text_to_image_sdxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,13 +547,25 @@ def parse_args(input_args=None):
"--profiling_warmup_steps",
default=0,
type=int,
help="Number of steps to ignore for profiling.",
help="Number of training steps to ignore for profiling.",
)
parser.add_argument(
"--profiling_steps",
default=0,
type=int,
help="Number of steps to capture for profiling.",
help="Number of training steps to capture for profiling.",
)
parser.add_argument(
"--profiling_warmup_steps_eval",
default=0,
type=int,
help="Number of inference steps to ignore for profiling.",
)
parser.add_argument(
"--profiling_steps_eval",
default=0,
type=int,
help="Number of inference steps to capture for profiling.",
)
parser.add_argument(
"--logging_step",
Expand Down Expand Up @@ -1153,9 +1165,7 @@ def unwrap_model(model, training=False):

unwrap_model(model=unet, training=True)
hb_profiler = HabanaProfile(
warmup=args.profiling_warmup_steps,
active=args.profiling_steps,
record_shapes=False,
warmup=args.profiling_warmup_steps, active=args.profiling_steps, record_shapes=False, name="train"
)
# Train!
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
Expand Down Expand Up @@ -1521,6 +1531,8 @@ def compute_time_ids(original_size, crops_coords_top_left):
args.validation_prompt,
num_inference_steps=25,
generator=generator,
profiling_warmup_steps=args.profiling_warmup_steps_eval,
profiling_steps=args.profiling_steps_eval,
).images[0]
for _ in range(args.num_validation_images)
]
Expand Down
63 changes: 39 additions & 24 deletions examples/text-generation/run_generation.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ def setup_parser(parser):
action="store_true",
help="Record shapes when enabling profiling.",
)
parser.add_argument(
"--profile_whole_sequences",
action="store_true",
help="When set, profiling step means generation of one whole sequence (not one token).",
)
parser.add_argument(
"--prompt",
default=None,
Expand Down Expand Up @@ -486,10 +491,24 @@ def main():

import habana_frameworks.torch.hpu as torch_hpu

from optimum.habana.utils import HabanaGenerationTime, HabanaProfile, get_hpu_memory_stats

if args.sdp_on_bf16:
torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

from optimum.habana.utils import HabanaGenerationTime, get_hpu_memory_stats
active_profiler = HabanaProfile(
warmup=args.profiling_warmup_steps,
active=args.profiling_steps,
record_shapes=args.profiling_record_shapes,
name="generate",
)
disabled_profiler = HabanaProfile()
if args.profile_whole_sequences:
per_sequence_profiler = active_profiler
per_token_profiler = disabled_profiler
else:
per_sequence_profiler = disabled_profiler
per_token_profiler = active_profiler

if args.dataset_name == "mlcommons":
# Benchmark over the prompts below
Expand Down Expand Up @@ -708,8 +727,9 @@ def assemble_prompt(prompt_size, book_path):
elif args.batch_size < len(input_sentences):
input_sentences = input_sentences[: args.batch_size]

def generate(size=None, reduce_recompile=False):
def generate(size=None, reduce_recompile=False, disable_profiling=False):
"""Generates sequences from the input sentences and returns them."""
profiler = disabled_profiler if disable_profiling else per_token_profiler
timer = HabanaGenerationTime()
timer.start()
# Tokenization
Expand Down Expand Up @@ -770,11 +790,9 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
assistant_model=assistant_model,
lazy_mode=use_lazy_mode,
hpu_graphs=args.use_hpu_graphs,
profiling_steps=args.profiling_steps,
profiling_warmup_steps=args.profiling_warmup_steps,
ignore_eos=args.ignore_eos,
iteration_times=iteration_times,
profiling_record_shapes=args.profiling_record_shapes,
profiler=profiler,
).cpu()
timer.step()
first_token_time = iteration_times[0] + encode_duration
Expand All @@ -790,10 +808,6 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
e2e_latency,
)

from optimum.habana.utils import HabanaProfile

# compilation stage disable profiling
HabanaProfile.disable()
# Compilation
logger.info("Graph compilation...")
dyn_prompt_lens = args.simulate_dyn_prompt
Expand All @@ -804,10 +818,10 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
for i in range(args.warmup):
if dyn_prompt_lens is None:
print(f"Warming up iteration {i + 1}/{args.warmup}", flush=True)
generate(None, args.reduce_recompile)
generate(None, args.reduce_recompile, disable_profiling=True)
else:
print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i + 1}/{args.warmup}", flush=True)
generate(dyn_prompt_lens[0], args.reduce_recompile)
generate(dyn_prompt_lens[0], args.reduce_recompile, disable_profiling=True)
else:
if args.bucket_size > 0:
mn = min(dyn_prompt_lens)
Expand All @@ -822,24 +836,25 @@ def rounder(x):
lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size))
for sz in lst:
print(f"Warming up for shape {sz - 1} iteration {i + 1}/{args.warmup}", flush=True)
generate(sz - 1, args.reduce_recompile)
generate(sz - 1, args.reduce_recompile, disable_profiling=True)
torch_hpu.synchronize()
timer.step()
compilation_duration = timer.last_duration
HabanaProfile.enable()
total_new_tokens_generated = 0
logger.info("Running generate...")
first_token_latencies = []
rest_token_latencies = []
e2e_latencies = []
timer.step()
# Benchmark over n_iterations iterations
per_sequence_profiler.start()
if dyn_prompt_lens is None:
for i in range(args.n_iterations):
generated, first_token_time, rest_token_time, e2e_latency = generate(None, args.reduce_recompile)
first_token_latencies.append(first_token_time)
rest_token_latencies.append(rest_token_time)
e2e_latencies.append(e2e_latency)
per_sequence_profiler.step()
else:
repeated_prompt_len = cycle(dyn_prompt_lens)
for i in range(args.n_iterations):
Expand All @@ -849,9 +864,11 @@ def rounder(x):
first_token_latencies.append(first_token_time)
rest_token_latencies.append(rest_token_time)
e2e_latencies.append(e2e_latency)
per_sequence_profiler.step()
timer.step()
logger.info("Finished running generate")
duration = timer.last_duration
per_sequence_profiler.stop()
total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens
throughput = total_new_tokens_generated / duration
# Calculate average latencies
Expand Down Expand Up @@ -983,7 +1000,9 @@ def collate_fn(data):

dataloader = DataLoader(raw_dataset, batch_size=args.batch_size, collate_fn=collate_fn)

def generate_dataset(batch):
def generate_dataset(batch, disable_profiling=False):
profiler = disabled_profiler if disable_profiling else per_token_profiler

prompt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
# Move inputs to target device(s)
for t in batch:
Expand All @@ -995,18 +1014,11 @@ def generate_dataset(batch):
generation_config=generation_config,
lazy_mode=use_lazy_mode,
hpu_graphs=args.use_hpu_graphs,
profiling_steps=args.profiling_steps,
profiling_warmup_steps=args.profiling_warmup_steps,
ignore_eos=args.ignore_eos,
profiling_record_shapes=args.profiling_record_shapes,
profiler=profiler,
).cpu()
return prompt, outputs

# warmup
from optimum.habana.utils import HabanaProfile

# compilation stage disable profiling
HabanaProfile.disable()
# Compilation
logger.info("Graph compilation...")
timer = HabanaGenerationTime()
Expand All @@ -1022,14 +1034,15 @@ def generate_dataset(batch):
torch_hpu.synchronize()
timer.step()
compilation_duration = timer.last_duration
HabanaProfile.enable()

total_new_tokens_generated = 0
duration = 0
separator = "-" * 50
logger.info("Running generate dataset...")

timer = HabanaGenerationTime()
timer.start()
per_sequence_profiler.start()

for i, batch in enumerate(dataloader):
timer.step()
prompt, outputs = generate_dataset(batch)
Expand All @@ -1045,7 +1058,9 @@ def generate_dataset(batch):
print(separator)
if args.run_partial_dataset and args.n_iterations == i + 1:
break
per_sequence_profiler.step()
timer.step()
per_sequence_profiler.stop()

throughput = total_new_tokens_generated / duration
# Print Stats
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="diffuser_pipeline",
)
hb_profiler.start()

Expand Down
1 change: 1 addition & 0 deletions optimum/habana/diffusers/pipelines/flux/pipeline_flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="diffuser_pipeline",
)
hb_profiler.start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="diffuser_pipeline",
)
hb_profiler.start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="stable_diffusion",
)
hb_profiler.start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="stable_diffusion",
)
hb_profiler.start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="stable_diffusion",
)
hb_profiler.start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="stable_diffusion",
)
hb_profiler.start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="stable_diffusion",
)

hb_profiler.start()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="stable_diffusion",
)
hb_profiler.start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ def denoising_value_valid(dnv):
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="stable_diffusion",
)
hb_profiler.start()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,7 @@ def __call__(
warmup=profiling_warmup_steps,
active=profiling_steps,
record_shapes=False,
name="stable_diffusion",
)
hb_profiler.start()

Expand Down
Loading
Loading