huggingface · regisss · Jul 10, 2025 · Jan 17, 2025 · Jan 24, 2025 · Feb 18, 2025
@@ -36,7 +36,7 @@ style: clean
 # Run unit and integration tests
 fast_tests:
 	python -m pip install .[tests]
-	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py
+	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py tests/test_habana_profiler_unit.py
 # TODO enable when CI has more servers
 #	python -m pytest test_functional_text_generation_example.py
 
@@ -89,11 +89,15 @@ slow_tests_1x: test_installs
 	python -m pip install peft==0.10.0; \
 	python -m pytest tests/test_peft_inference.py || status2=$$?; \
 	python -m pytest tests/test_pipeline.py || status3=$$?; \
-	exit $$((status1 + status2 + status3))
+	status4=0; python -m pytest tests/test_habana_profiler_integration.py -v -s -m "not x8" || status4=$$?; \
+	exit $$((status1 + status2 + status3 + status4))
 
 # Run multi-card non-regression tests
 slow_tests_8x: test_installs
-	DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card"
+	@status1=0; status2=0; \
+	DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card" || status1=$$?; \
+	python -m pytest tests/test_habana_profiler_integration.py -v -s -m x8 || status2=$$?; \
+	exit $$((status1 + status2))
 
 # Run DeepSpeed non-regression tests
 slow_tests_deepspeed: test_installs

@@ -547,13 +547,25 @@ def parse_args(input_args=None):
         "--profiling_warmup_steps",
         default=0,
         type=int,
-        help="Number of steps to ignore for profiling.",
+        help="Number of training steps to ignore for profiling.",
     )
     parser.add_argument(
         "--profiling_steps",
         default=0,
         type=int,
-        help="Number of steps to capture for profiling.",
+        help="Number of training steps to capture for profiling.",
+    )
+    parser.add_argument(
+        "--profiling_warmup_steps_eval",
+        default=0,
+        type=int,
+        help="Number of inference steps to ignore for profiling.",
+    )
+    parser.add_argument(
+        "--profiling_steps_eval",
+        default=0,
+        type=int,
+        help="Number of inference steps to capture for profiling.",
     )
     parser.add_argument(
         "--logging_step",
@@ -1153,9 +1165,7 @@ def unwrap_model(model, training=False):
 
     unwrap_model(model=unet, training=True)
     hb_profiler = HabanaProfile(
-        warmup=args.profiling_warmup_steps,
-        active=args.profiling_steps,
-        record_shapes=False,
+        warmup=args.profiling_warmup_steps, active=args.profiling_steps, record_shapes=False, name="train"
     )
     # Train!
     total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -1521,6 +1531,8 @@ def compute_time_ids(original_size, crops_coords_top_left):
                         args.validation_prompt,
                         num_inference_steps=25,
                         generator=generator,
+                        profiling_warmup_steps=args.profiling_warmup_steps_eval,
+                        profiling_steps=args.profiling_steps_eval,
                     ).images[0]
                     for _ in range(args.num_validation_images)
                 ]

@@ -157,6 +157,11 @@ def setup_parser(parser):
         action="store_true",
         help="Record shapes when enabling profiling.",
     )
+    parser.add_argument(
+        "--profile_whole_sequences",
+        action="store_true",
+        help="When set, profiling step means generation of one whole sequence (not one token).",
+    )
     parser.add_argument(
         "--prompt",
         default=None,
@@ -486,10 +491,24 @@ def main():
 
     import habana_frameworks.torch.hpu as torch_hpu
 
+    from optimum.habana.utils import HabanaGenerationTime, HabanaProfile, get_hpu_memory_stats
+
     if args.sdp_on_bf16:
         torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
 
-    from optimum.habana.utils import HabanaGenerationTime, get_hpu_memory_stats
+    active_profiler = HabanaProfile(
+        warmup=args.profiling_warmup_steps,
+        active=args.profiling_steps,
+        record_shapes=args.profiling_record_shapes,
+        name="generate",
+    )
+    disabled_profiler = HabanaProfile()
+    if args.profile_whole_sequences:
+        per_sequence_profiler = active_profiler
+        per_token_profiler = disabled_profiler
+    else:
+        per_sequence_profiler = disabled_profiler
+        per_token_profiler = active_profiler
 
     if args.dataset_name == "mlcommons":
         # Benchmark over the prompts below
@@ -708,8 +727,9 @@ def assemble_prompt(prompt_size, book_path):
         elif args.batch_size < len(input_sentences):
             input_sentences = input_sentences[: args.batch_size]
 
-        def generate(size=None, reduce_recompile=False):
+        def generate(size=None, reduce_recompile=False, disable_profiling=False):
             """Generates sequences from the input sentences and returns them."""
+            profiler = disabled_profiler if disable_profiling else per_token_profiler
             timer = HabanaGenerationTime()
             timer.start()
             # Tokenization
@@ -770,11 +790,9 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
                 assistant_model=assistant_model,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
-                profiling_steps=args.profiling_steps,
-                profiling_warmup_steps=args.profiling_warmup_steps,
                 ignore_eos=args.ignore_eos,
                 iteration_times=iteration_times,
-                profiling_record_shapes=args.profiling_record_shapes,
+                profiler=profiler,
             ).cpu()
             timer.step()
             first_token_time = iteration_times[0] + encode_duration
@@ -790,10 +808,6 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
                 e2e_latency,
             )
 
-        from optimum.habana.utils import HabanaProfile
-
-        # compilation stage disable profiling
-        HabanaProfile.disable()
         # Compilation
         logger.info("Graph compilation...")
         dyn_prompt_lens = args.simulate_dyn_prompt
@@ -804,10 +818,10 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
             for i in range(args.warmup):
                 if dyn_prompt_lens is None:
                     print(f"Warming up iteration {i + 1}/{args.warmup}", flush=True)
-                    generate(None, args.reduce_recompile)
+                    generate(None, args.reduce_recompile, disable_profiling=True)
                 else:
                     print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i + 1}/{args.warmup}", flush=True)
-                    generate(dyn_prompt_lens[0], args.reduce_recompile)
+                    generate(dyn_prompt_lens[0], args.reduce_recompile, disable_profiling=True)
         else:
             if args.bucket_size > 0:
                 mn = min(dyn_prompt_lens)
@@ -822,24 +836,25 @@ def rounder(x):
                     lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size))
                     for sz in lst:
                         print(f"Warming up for shape {sz - 1} iteration {i + 1}/{args.warmup}", flush=True)
-                        generate(sz - 1, args.reduce_recompile)
+                        generate(sz - 1, args.reduce_recompile, disable_profiling=True)
         torch_hpu.synchronize()
         timer.step()
         compilation_duration = timer.last_duration
-        HabanaProfile.enable()
         total_new_tokens_generated = 0
         logger.info("Running generate...")
         first_token_latencies = []
         rest_token_latencies = []
         e2e_latencies = []
         timer.step()
         # Benchmark over n_iterations iterations
+        per_sequence_profiler.start()
         if dyn_prompt_lens is None:
             for i in range(args.n_iterations):
                 generated, first_token_time, rest_token_time, e2e_latency = generate(None, args.reduce_recompile)
                 first_token_latencies.append(first_token_time)
                 rest_token_latencies.append(rest_token_time)
                 e2e_latencies.append(e2e_latency)
+                per_sequence_profiler.step()
         else:
             repeated_prompt_len = cycle(dyn_prompt_lens)
             for i in range(args.n_iterations):
@@ -849,9 +864,11 @@ def rounder(x):
                 first_token_latencies.append(first_token_time)
                 rest_token_latencies.append(rest_token_time)
                 e2e_latencies.append(e2e_latency)
+                per_sequence_profiler.step()
         timer.step()
         logger.info("Finished running generate")
         duration = timer.last_duration
+        per_sequence_profiler.stop()
         total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens
         throughput = total_new_tokens_generated / duration
         # Calculate average latencies
@@ -983,7 +1000,9 @@ def collate_fn(data):
 
         dataloader = DataLoader(raw_dataset, batch_size=args.batch_size, collate_fn=collate_fn)
 
-        def generate_dataset(batch):
+        def generate_dataset(batch, disable_profiling=False):
+            profiler = disabled_profiler if disable_profiling else per_token_profiler
+
             prompt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
             # Move inputs to target device(s)
             for t in batch:
@@ -995,18 +1014,11 @@ def generate_dataset(batch):
                 generation_config=generation_config,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
-                profiling_steps=args.profiling_steps,
-                profiling_warmup_steps=args.profiling_warmup_steps,
                 ignore_eos=args.ignore_eos,
-                profiling_record_shapes=args.profiling_record_shapes,
+                profiler=profiler,
             ).cpu()
             return prompt, outputs
 
-        # warmup
-        from optimum.habana.utils import HabanaProfile
-
-        # compilation stage disable profiling
-        HabanaProfile.disable()
         # Compilation
         logger.info("Graph compilation...")
         timer = HabanaGenerationTime()
@@ -1022,14 +1034,15 @@ def generate_dataset(batch):
         torch_hpu.synchronize()
         timer.step()
         compilation_duration = timer.last_duration
-        HabanaProfile.enable()
-
         total_new_tokens_generated = 0
         duration = 0
         separator = "-" * 50
         logger.info("Running generate dataset...")
+
         timer = HabanaGenerationTime()
         timer.start()
+        per_sequence_profiler.start()
+
         for i, batch in enumerate(dataloader):
             timer.step()
             prompt, outputs = generate_dataset(batch)
@@ -1045,7 +1058,9 @@ def generate_dataset(batch):
             print(separator)
             if args.run_partial_dataset and args.n_iterations == i + 1:
                 break
+            per_sequence_profiler.step()
         timer.step()
+        per_sequence_profiler.stop()
 
         throughput = total_new_tokens_generated / duration
         # Print Stats

@@ -500,6 +500,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="diffuser_pipeline",
             )
             hb_profiler.start()
 

@@ -470,6 +470,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="diffuser_pipeline",
         )
         hb_profiler.start()
 

@@ -502,6 +502,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="diffuser_pipeline",
         )
         hb_profiler.start()
 

@@ -494,6 +494,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 

@@ -309,6 +309,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 

@@ -513,6 +513,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 

@@ -392,6 +392,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 

@@ -542,6 +542,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
 
             hb_profiler.start()

@@ -662,6 +662,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 

@@ -502,6 +502,7 @@ def denoising_value_valid(dnv):
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 

@@ -756,6 +756,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="stable_diffusion",
         )
         hb_profiler.start()