From 4de3a7fd0d3bdb11e38653fefb62aff9047567ae Mon Sep 17 00:00:00 2001
From: Urszula Golowicz <urszula.golowicz@intel.com>
Date: Fri, 17 Jan 2025 18:29:08 +0200
Subject: [PATCH 1/6] Profiling can be run more than once

Profiling can be started more than once, at different points
in the course of running an example, e.g. both during training
and during evaluation.

Signed-off-by: Urszula Golowicz <urszula.golowicz@intel.com>
---
 Makefile                      |   2 +-
 optimum/habana/utils.py       |  70 +++++++------------
 tests/test_habana_profiler.py | 125 ++++++++++++++++++++++++++++++++++
 3 files changed, 149 insertions(+), 48 deletions(-)
 create mode 100644 tests/test_habana_profiler.py

diff --git a/Makefile b/Makefile
index 2ef5553a23..ded7c6e0ff 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ style: clean
 # Run unit and integration tests
 fast_tests:
 	python -m pip install .[tests]
-	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py
+	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py tests/test_habana_profiler.py
 # TODO enable when CI has more servers
 #	python -m pytest test_functional_text_generation_example.py
 
diff --git a/optimum/habana/utils.py b/optimum/habana/utils.py
index d47c94ab19..596abae51e 100755
--- a/optimum/habana/utils.py
+++ b/optimum/habana/utils.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import random
 import subprocess
 import time
@@ -290,12 +291,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self.step()
 
 
-class HabanaProfile(object):
-    """
-    HPU profiler only could be run once, so HABANA_PROFILE_ENABLED, a class static variable shared by all the instances of HabanaProfile, is used to control which part will be captured.
-    """
-
-    HABANA_PROFILE_ENABLED = True
+class HabanaProfile:
+    _profilers = []
 
     def __init__(
         self,
@@ -303,65 +300,44 @@ def __init__(
         active: int = 0,
         record_shapes: bool = True,
         with_stack: bool = False,
+        name: str = "",
         output_dir: str = "./hpu_profile",
         wait: int = 0,
     ):
-        if active <= 0 or warmup < 0 or not HabanaProfile.HABANA_PROFILE_ENABLED:
+        self._profiler = None
+        self._running = False
 
-            def noop():
-                pass
+        if active <= 0:
+            self.start = self.stop = self.step = lambda: None
 
-            self.start = noop
-            self.stop = noop
-            self.step = noop
         else:
-            HabanaProfile.HABANA_PROFILE_ENABLED = False
+            output_dir = os.path.join(output_dir, name)
+
             schedule = torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1)
             activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU]
-
-            profiler = torch.profiler.profile(
+            self._profiler = torch.profiler.profile(
                 schedule=schedule,
                 activities=activities,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(output_dir),
                 record_shapes=record_shapes,
                 with_stack=with_stack,
             )
-            self.start = profiler.start
-            self.stop = profiler.stop
-            self.step = profiler.step
-            HabanaProfile.enable.invalid = True
-            HabanaProfile.disable.invalid = True
-
-    def stop(self):
-        self.stop()
+            self._profilers.append(self)
 
     def start(self):
-        self.start()
+        if any(p._running for p in self._profilers):
+            raise RuntimeError("Cannot start profiler, another profiler instance is already running")
+        self._running = True
+        self._profiler.start()
 
-    def step(self):
-        self.step()
+    def stop(self):
+        if self._running:
+            self._profiler.stop()
+            self._running = False
 
-    @staticmethod
-    def disable():
-        """
-        Runs only once and must happen before doing profiling.
-        """
-        if hasattr(HabanaProfile.disable, "invalid"):
-            if not HabanaProfile.disable.invalid:
-                HabanaProfile.HABANA_PROFILE_ENABLED = False
-        else:
-            HabanaProfile.HABANA_PROFILE_ENABLED = False
-
-    @staticmethod
-    def enable():
-        """
-        Runs only once and must happen before doing profiling.
-        """
-        if hasattr(HabanaProfile.enable, "invalid"):
-            if not HabanaProfile.enable.invalid:
-                HabanaProfile.HABANA_PROFILE_ENABLED = True
-        else:
-            HabanaProfile.HABANA_PROFILE_ENABLED = True
+    def step(self):
+        if self._running:
+            self._profiler.step()
 
 
 def check_optimum_habana_min_version(min_version):
diff --git a/tests/test_habana_profiler.py b/tests/test_habana_profiler.py
new file mode 100644
index 0000000000..646e604866
--- /dev/null
+++ b/tests/test_habana_profiler.py
@@ -0,0 +1,125 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+from unittest.mock import MagicMock
+
+import pytest
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+from optimum.habana.utils import HabanaProfile
+
+
+adapt_transformers_to_gaudi()
+
+
+PROFILER_OUTPUT_DIR = "./hpu_profile"
+
+
+@pytest.fixture
+def patched_profiler(monkeypatch):
+    p = HabanaProfile(warmup=1, active=1)
+    mock_start = MagicMock()
+    mock_stop = MagicMock()
+    mock_step = MagicMock()
+    monkeypatch.setattr(p._profiler, "start", mock_start)
+    monkeypatch.setattr(p._profiler, "stop", mock_stop)
+    monkeypatch.setattr(p._profiler, "step", mock_step)
+    yield p
+
+
+@pytest.fixture(autouse=True)
+def cleanup():
+    shutil.rmtree(PROFILER_OUTPUT_DIR, ignore_errors=True)
+    HabanaProfile._profilers = []
+
+
+def run_profiling(profiler):
+    profiler.start()
+    for _ in range(2):
+        profiler.step()
+    profiler.stop()
+
+
+def test_init_profiler_with_no_steps():
+    profiler = HabanaProfile()
+    assert profiler._profiler is None
+    assert profiler.start() is None
+    assert not profiler._running
+    assert profiler.step() is None
+    assert profiler.stop() is None
+
+
+def test_init_profiler_with_steps(patched_profiler):
+    assert not patched_profiler._running
+    assert patched_profiler._profiler is not None
+
+
+def test_start_profiling(patched_profiler):
+    patched_profiler.start()
+    assert patched_profiler._running
+    patched_profiler._profiler.start.assert_called_once()
+
+
+def test_call_step_on_profiler(patched_profiler):
+    patched_profiler.start()
+    patched_profiler.step()
+    assert patched_profiler._running
+    patched_profiler._profiler.step.assert_called_once()
+
+
+def test_stop_profiling(patched_profiler):
+    patched_profiler.start()
+    patched_profiler.stop()
+    assert not patched_profiler._running
+    patched_profiler._profiler.stop.assert_called_once()
+
+
+def test_profiler_files():
+    profiler = HabanaProfile(warmup=1, active=1)
+    run_profiling(profiler)
+    assert os.path.exists(PROFILER_OUTPUT_DIR)
+    assert len(os.listdir(PROFILER_OUTPUT_DIR)) == 1
+
+
+def test_profiler_with_name():
+    profiler = HabanaProfile(warmup=1, active=1, name="test")
+    run_profiling(profiler)
+    expected_dir = os.path.join(PROFILER_OUTPUT_DIR, "test")
+    assert os.path.exists(expected_dir)
+    assert len(os.listdir(expected_dir)) == 1
+
+
+def test_profiler_with_no_steps_doesnt_run():
+    profiler = HabanaProfile()
+    run_profiling(profiler)
+    assert not os.path.exists(PROFILER_OUTPUT_DIR)
+
+
+def test_two_profilers_can_run_sequentially():
+    profiler_0 = HabanaProfile(warmup=1, active=1)
+    run_profiling(profiler_0)
+    profiler_1 = HabanaProfile(warmup=1, active=1)
+    run_profiling(profiler_1)
+    assert os.path.exists(PROFILER_OUTPUT_DIR)
+    assert len(os.listdir(PROFILER_OUTPUT_DIR)) == 2
+
+
+def test_cannot_start_profiler_when_another_is_running(patched_profiler):
+    another_profiler = HabanaProfile(warmup=1, active=1)
+    patched_profiler.start()
+    with pytest.raises(RuntimeError):
+        another_profiler.start()

From dfcaee557483e4bf2c19da0e5e49ec1a53689bb2 Mon Sep 17 00:00:00 2001
From: Urszula Golowicz <urszula.golowicz@intel.com>
Date: Fri, 24 Jan 2025 15:36:30 +0200
Subject: [PATCH 2/6] User can collect traces in train and/or eval

Update arguments, examples, and classes to enable profiling
in both the train and eval stages of a run.

Signed-off-by: Urszula Golowicz <urszula.golowicz@intel.com>
---
 .../training/train_text_to_image_sdxl.py      | 22 +++++--
 examples/text-generation/run_generation.py    | 29 +++-----
 .../controlnet/pipeline_controlnet.py         |  1 +
 .../diffusers/pipelines/flux/pipeline_flux.py |  1 +
 .../pipelines/flux/pipeline_flux_img2img.py   |  1 +
 .../pipeline_stable_diffusion.py              |  1 +
 ...peline_stable_diffusion_image_variation.py |  1 +
 .../pipeline_stable_diffusion_img2img.py      |  1 +
 ...eline_stable_diffusion_instruct_pix2pix.py |  1 +
 .../pipeline_stable_diffusion_3.py            |  1 +
 .../pipeline_stable_diffusion_xl.py           |  1 +
 .../pipeline_stable_diffusion_xl_img2img.py   |  1 +
 .../pipeline_stable_diffusion_xl_mlperf.py    |  1 +
 .../habana/transformers/generation/utils.py   | 66 ++++++++++++-------
 optimum/habana/transformers/trainer.py        | 14 ++++
 optimum/habana/transformers/training_args.py  | 22 +++++--
 16 files changed, 111 insertions(+), 53 deletions(-)

diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py
index 68c8ab27eb..5980dc7874 100755
--- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py
+++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py
@@ -547,13 +547,25 @@ def parse_args(input_args=None):
         "--profiling_warmup_steps",
         default=0,
         type=int,
-        help="Number of steps to ignore for profiling.",
+        help="Number of training steps to ignore for profiling.",
     )
     parser.add_argument(
         "--profiling_steps",
         default=0,
         type=int,
-        help="Number of steps to capture for profiling.",
+        help="Number of training steps to capture for profiling.",
+    )
+    parser.add_argument(
+        "--profiling_warmup_steps_eval",
+        default=0,
+        type=int,
+        help="Number of inference steps to ignore for profiling.",
+    )
+    parser.add_argument(
+        "--profiling_steps_eval",
+        default=0,
+        type=int,
+        help="Number of inference steps to capture for profiling.",
     )
     parser.add_argument(
         "--logging_step",
@@ -1153,9 +1165,7 @@ def unwrap_model(model, training=False):
 
     unwrap_model(model=unet, training=True)
     hb_profiler = HabanaProfile(
-        warmup=args.profiling_warmup_steps,
-        active=args.profiling_steps,
-        record_shapes=False,
+        warmup=args.profiling_warmup_steps, active=args.profiling_steps, record_shapes=False, name="train"
     )
     # Train!
     total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -1521,6 +1531,8 @@ def compute_time_ids(original_size, crops_coords_top_left):
                         args.validation_prompt,
                         num_inference_steps=25,
                         generator=generator,
+                        profiling_warmup_steps=args.profiling_warmup_steps_eval,
+                        profiling_steps=args.profiling_steps_eval,
                     ).images[0]
                     for _ in range(args.num_validation_images)
                 ]
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index b10436c4bf..2c0c576fdc 100755
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -505,8 +505,9 @@ def assemble_prompt(prompt_size, book_path):
         elif args.batch_size < len(input_sentences):
             input_sentences = input_sentences[: args.batch_size]
 
-        def generate(size=None, reduce_recompile=False):
+        def generate(size=None, reduce_recompile=False, disable_profiling=False):
             """Generates sequences from the input sentences and returns them."""
+            profiling_steps = 0 if disable_profiling else args.profiling_steps
             timer = HabanaGenerationTime()
             timer.start()
             # Tokenization
@@ -568,7 +569,7 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
                 assistant_model=assistant_model,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
-                profiling_steps=args.profiling_steps,
+                profiling_steps=profiling_steps,
                 profiling_warmup_steps=args.profiling_warmup_steps,
                 ignore_eos=args.ignore_eos,
                 iteration_times=iteration_times,
@@ -588,10 +589,6 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
                 e2e_latency,
             )
 
-        from optimum.habana.utils import HabanaProfile
-
-        # compilation stage disable profiling
-        HabanaProfile.disable()
         # Compilation
         logger.info("Graph compilation...")
         dyn_prompt_lens = args.simulate_dyn_prompt
@@ -602,10 +599,10 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
             for i in range(args.warmup):
                 if dyn_prompt_lens is None:
                     print(f"Warming up iteration {i + 1}/{args.warmup}", flush=True)
-                    generate(None, args.reduce_recompile)
+                    generate(None, args.reduce_recompile, disable_profiling=True)
                 else:
                     print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i + 1}/{args.warmup}", flush=True)
-                    generate(dyn_prompt_lens[0], args.reduce_recompile)
+                    generate(dyn_prompt_lens[0], args.reduce_recompile, disable_profiling=True)
         else:
             if args.bucket_size > 0:
                 mn = min(dyn_prompt_lens)
@@ -620,11 +617,10 @@ def rounder(x):
                     lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size))
                     for sz in lst:
                         print(f"Warming up for shape {sz - 1} iteration {i + 1}/{args.warmup}", flush=True)
-                        generate(sz - 1, args.reduce_recompile)
+                        generate(sz - 1, args.reduce_recompile, disable_profiling=True)
         torch_hpu.synchronize()
         timer.step()
         compilation_duration = timer.last_duration
-        HabanaProfile.enable()
         total_new_tokens_generated = 0
         logger.info("Running generate...")
         first_token_latencies = []
@@ -781,7 +777,9 @@ def collate_fn(data):
 
         dataloader = DataLoader(raw_dataset, batch_size=args.batch_size, collate_fn=collate_fn)
 
-        def generate_dataset(batch):
+        def generate_dataset(batch, disable_profiling=False):
+            profiling_steps = 0 if disable_profiling else args.profiling_steps
+
             prompt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
             # Move inputs to target device(s)
             for t in batch:
@@ -793,18 +791,13 @@ def generate_dataset(batch):
                 generation_config=generation_config,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
-                profiling_steps=args.profiling_steps,
+                profiling_steps=profiling_steps,
                 profiling_warmup_steps=args.profiling_warmup_steps,
                 ignore_eos=args.ignore_eos,
                 profiling_record_shapes=args.profiling_record_shapes,
             ).cpu()
             return prompt, outputs
 
-        # warmup
-        from optimum.habana.utils import HabanaProfile
-
-        # compilation stage disable profiling
-        HabanaProfile.disable()
         # Compilation
         logger.info("Graph compilation...")
         timer = HabanaGenerationTime()
@@ -820,8 +813,6 @@ def generate_dataset(batch):
         torch_hpu.synchronize()
         timer.step()
         compilation_duration = timer.last_duration
-        HabanaProfile.enable()
-
         total_new_tokens_generated = 0
         duration = 0
         separator = "-" * 50
diff --git a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py
index 56714da448..4539b5b822 100644
--- a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py
+++ b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py
@@ -500,6 +500,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="diffuser_pipeline",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py
index 9c4abf3e83..760581a62e 100644
--- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py
+++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py
@@ -648,6 +648,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="diffuser_pipeline",
         )
         hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
index 17894db5ae..d6e49ea8cd 100644
--- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
+++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
@@ -672,6 +672,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="diffuser_pipeline",
         )
         hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 7efe1059bc..d1537582bc 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -494,6 +494,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index 7cd8d23ade..be361f9186 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -309,6 +309,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 3086b23c0c..22aa954682 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -513,6 +513,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index a2a7ec1399..0794927a3b 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -392,6 +392,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
index aa117577a1..92c0e335ae 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
@@ -634,6 +634,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
 
             hb_profiler.start()
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index 610f8eabba..369ede5834 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -662,6 +662,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index 6846e1a146..cd9550de20 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -502,6 +502,7 @@ def denoising_value_valid(dnv):
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py
index e9285fe4b8..edebec1778 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py
@@ -756,6 +756,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="stable_diffusion",
         )
         hb_profiler.start()
 
diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py
index b2a3b776ab..a4c9d74311 100755
--- a/optimum/habana/transformers/generation/utils.py
+++ b/optimum/habana/transformers/generation/utils.py
@@ -2013,10 +2013,13 @@ def _contrastive_search(
 
         this_peer_finished = False
 
-        hb_profer = HabanaProfile(
-            warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes
+        hb_profiler = HabanaProfile(
+            warmup=profiling_warmup_steps,
+            active=profiling_steps,
+            record_shapes=profiling_record_shapes,
+            name="generation",
         )
-        hb_profer.start()
+        hb_profiler.start()
         bucket_size = model_kwargs.get("bucket_size", -1)
         prev_idx = -1  # avoiding calculate cache_idx when its value is not changing
         bucket_internal = model_kwargs.get("bucket_internal", None)
@@ -2459,7 +2462,7 @@ def _contrastive_search(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profer.step()
+            hb_profiler.step()
 
         if (
             model_kwargs.get("use_hpu_graphs", False)
@@ -2472,7 +2475,7 @@ def _contrastive_search(
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profer.stop()
+        hb_profiler.stop()
         if streamer is not None:
             streamer.end()
 
@@ -2614,10 +2617,13 @@ def _sample(
         bucket_internal = model_kwargs.get("bucket_internal", None)
         reduce_recompile = model_kwargs.get("reduce_recompile", False)
 
-        hb_profer = HabanaProfile(
-            warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes
+        hb_profiler = HabanaProfile(
+            warmup=profiling_warmup_steps,
+            active=profiling_steps,
+            record_shapes=profiling_record_shapes,
+            name="generation",
         )
-        hb_profer.start()
+        hb_profiler.start()
 
         if not bucket_internal:
             if bucket_size >= 0:
@@ -2790,7 +2796,7 @@ def _sample(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profer.step()
+            hb_profiler.step()
 
             if (
                 not model_kwargs.get("pad_done", False)
@@ -2842,7 +2848,7 @@ def _sample(
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profer.stop()
+        hb_profiler.stop()
 
         if streamer is not None:
             streamer.end()
@@ -3101,10 +3107,13 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             input_ids = torch.stack(return_res)
             return input_ids
 
-        hb_profer = HabanaProfile(
-            warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes
+        hb_profiler = HabanaProfile(
+            warmup=profiling_warmup_steps,
+            active=profiling_steps,
+            record_shapes=profiling_record_shapes,
+            name="generation",
         )
-        hb_profer.start()
+        hb_profiler.start()
         this_peer_finished = False
 
         bucket_size = model_kwargs.get("bucket_size", -1)
@@ -3345,7 +3354,7 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
                 else:
                     model_kwargs["cache_idx"] = model_kwargs["kv_cache_len"]
 
-            hb_profer.step()
+            hb_profiler.step()
             if self.generation_config.static_shapes:
                 is_min_length_reached = (
                     self.generation_config.min_length and cur_len >= self.generation_config.min_length
@@ -3363,7 +3372,7 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             ):
                 this_peer_finished = True
 
-            hb_profer.step()
+            hb_profiler.step()
             if hb_gen_time is not None:
                 if not time_to_first_token_done:
                     time_to_first_token_done = True
@@ -3400,7 +3409,7 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profer.stop()
+        hb_profiler.stop()
 
         if self.generation_config.static_shapes:
             beam_trace = (beam_trace_idx, beam_trace_scores, beam_trace_indices, beam_trace_tokens)
@@ -3641,10 +3650,13 @@ def _constrained_beam_search(
         else:
             decoder_prompt_len = input_ids.shape[-1]
 
-        hb_profer = HabanaProfile(
-            warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes
+        hb_profiler = HabanaProfile(
+            warmup=profiling_warmup_steps,
+            active=profiling_steps,
+            record_shapes=profiling_record_shapes,
+            name="generation",
         )
-        hb_profer.start()
+        hb_profiler.start()
 
         time_to_first_token_done = False
         while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
@@ -3774,7 +3786,7 @@ def _constrained_beam_search(
             # increase cur_len
             cur_len = cur_len + 1
 
-            hb_profer.step()
+            hb_profiler.step()
 
             if constrained_beam_scorer.is_done or get_final_stopping_criteria(
                 stopping_criteria(input_ids, scores, token_idx=cur_len)
@@ -3789,7 +3801,7 @@ def _constrained_beam_search(
                     torch_hpu.synchronize()
                 hb_gen_time.step()
 
-        hb_profer.stop()
+        hb_profiler.stop()
         sequence_outputs = constrained_beam_scorer.finalize(
             input_ids,
             beam_scores,
@@ -3923,8 +3935,12 @@ def _assisted_decoding(
             unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
         model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
 
-        hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps)
-        hb_profer.start()
+        hb_profiler = HabanaProfile(
+            warmup=profiling_warmup_steps,
+            active=profiling_steps,
+            name="generation",
+        )
+        hb_profiler.start()
         this_peer_finished = False
         is_first_iteration = True  # to preserve the same API in the output as other generation methods
 
@@ -4121,12 +4137,12 @@ def _assisted_decoding(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profer.step()
+            hb_profiler.step()
 
             if this_peer_finished and not synced_gpus:
                 break
 
-        hb_profer.stop()
+        hb_profiler.stop()
         if streamer is not None:
             streamer.end()
 
diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
index 0eaf6a977c..2f3f909197 100644
--- a/optimum/habana/transformers/trainer.py
+++ b/optimum/habana/transformers/trainer.py
@@ -906,6 +906,7 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args, use_reentrant: Optio
             active=self.args.profiling_steps,
             record_shapes=self.args.profiling_record_shapes,
             with_stack=self.args.profiling_with_stack,
+            name="train",
         )
         hb_profiler.start()
 
@@ -1950,6 +1951,15 @@ def evaluation_loop(
         # set a default dtype of logits
         logits_dtype: str = "float32"
 
+        hb_profiler = HabanaProfile(
+            warmup=self.args.profiling_warmup_steps_eval,
+            active=self.args.profiling_steps_eval,
+            record_shapes=self.args.profiling_record_shapes,
+            with_stack=self.args.profiling_with_stack,
+            name=description.lower(),
+        )
+        hb_profiler.start()
+
         # Main evaluation loop
         start_time_eval = time.time()
         for step, inputs in enumerate(dataloader):
@@ -2040,6 +2050,10 @@ def evaluation_loop(
             if args.use_lazy_mode:
                 self.htcore.mark_step()
 
+            hb_profiler.step()
+
+        hb_profiler.stop()
+
         # After all calls to `.gather_function`, reset to `gather_for_metrics`:
         self.gather_function = self.accelerator.gather_for_metrics
         if args.past_index and hasattr(self, "_past"):
diff --git a/optimum/habana/transformers/training_args.py b/optimum/habana/transformers/training_args.py
index 1e8baf3eaa..991dcd5615 100644
--- a/optimum/habana/transformers/training_args.py
+++ b/optimum/habana/transformers/training_args.py
@@ -137,9 +137,13 @@ class GaudiTrainingArguments(TrainingArguments):
         non_blocking_data_copy (`bool`, *optional*, defaults to `False`):
             Whether to enable async data copy when preparing inputs.
         profiling_warmup_steps (`int`, *optional*, defaults to 0):
-            Number of steps to ignore for profiling.
+            Number of training steps to ignore for profiling.
         profiling_steps (`int`, *optional*, defaults to 0):
-            Number of steps to be captured when enabling profiling.
+            Number of training steps to be captured when enabling profiling.
+        profiling_warmup_steps_eval (`int`, *optional*, defaults to 0):
+            Number of eval steps to ignore for profiling.
+        profiling_steps_eval (`int`, *optional*, defaults to 0):
+            Number of eval steps to be captured when enabling profiling.
     """
 
     use_habana: Optional[bool] = field(
@@ -294,12 +298,22 @@ class GaudiTrainingArguments(TrainingArguments):
 
     profiling_warmup_steps: Optional[int] = field(
         default=0,
-        metadata={"help": ("Number of steps to ignore for profiling.")},
+        metadata={"help": ("Number of training steps to ignore for profiling.")},
     )
 
     profiling_steps: Optional[int] = field(
         default=0,
-        metadata={"help": ("Number of steps to be captured when enabling profiling.")},
+        metadata={"help": ("Number of training steps to be captured when enabling profiling.")},
+    )
+
+    profiling_warmup_steps_eval: Optional[int] = field(
+        default=0,
+        metadata={"help": ("Number of eval steps to ignore for profiling.")},
+    )
+
+    profiling_steps_eval: Optional[int] = field(
+        default=0,
+        metadata={"help": ("Number of eval steps to be captured when enabling profiling.")},
     )
 
     profiling_record_shapes: Optional[bool] = field(

From 2612a8479a9b37292189b9458eaa9db9e44f1f6f Mon Sep 17 00:00:00 2001
From: Urszula Golowicz <urszula.golowicz@intel.com>
Date: Tue, 18 Feb 2025 10:39:03 +0200
Subject: [PATCH 3/6] Integration tests for HabanaProfile

Signed-off-by: Urszula Golowicz <urszula.golowicz@intel.com>
---
 Makefile                                      |  10 +-
 tests/test_habana_profiler_integration.py     | 120 ++++++++++++++++++
 ...ofiler.py => test_habana_profiler_unit.py} |   0
 3 files changed, 127 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_habana_profiler_integration.py
 rename tests/{test_habana_profiler.py => test_habana_profiler_unit.py} (100%)

diff --git a/Makefile b/Makefile
index ded7c6e0ff..86dd626bbc 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ style: clean
 # Run unit and integration tests
 fast_tests:
 	python -m pip install .[tests]
-	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py tests/test_habana_profiler.py
+	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py tests/test_habana_profiler_unit.py
 # TODO enable when CI has more servers
 #	python -m pytest test_functional_text_generation_example.py
 
@@ -87,11 +87,15 @@ slow_tests_1x: test_installs
 	python -m pip install peft==0.10.0 \
 	python -m pytest tests/test_peft_inference.py || status2=$$?; \
 	python -m pytest tests/test_pipeline.py || status3=$$?; \
-	exit $$((status1 + status2 + status3))
+	python -m pytest tests/test_habana_profiler_integration.py -v -s -m "not x8" || status4=$$?; \
+	exit $$((status1 + status2 + status3 + status4))
 
 # Run multi-card non-regression tests
 slow_tests_8x: test_installs
-	DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card"
+	@status1=0; status2=0; \
+	DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card" || status1=$$?; \
+	python -m pytest tests/test_habana_profiler_integration.py -v -s -m x8 || status2=$$?; \
+	exit $$((status1 + status2))
 
 # Run DeepSpeed non-regression tests
 slow_tests_deepspeed: test_installs
diff --git a/tests/test_habana_profiler_integration.py b/tests/test_habana_profiler_integration.py
new file mode 100644
index 0000000000..fa286b6ca4
--- /dev/null
+++ b/tests/test_habana_profiler_integration.py
@@ -0,0 +1,120 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+import subprocess
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pytest
+
+
+@pytest.fixture
+def oh_path():
+    """Return the resolved outermost directory at or above the cwd named `optimum-habana*`."""
+    cwd = Path.cwd()
+    # Scan from the filesystem root down to the cwd so that the outermost match wins.
+    for candidate in (*reversed(cwd.parents), cwd):
+        if candidate.name.startswith("optimum-habana"):
+            return candidate.resolve()
+    pytest.fail(f"No directory named 'optimum-habana*' found at or above {cwd}")
+
+
+@pytest.fixture
+def profiling_dir(oh_path):
+    trace_root = oh_path / "hpu_profile"  # directory the tests look for profiler output in
+    yield trace_root
+    if trace_root.exists():  # teardown: remove any output so it cannot leak into the next test
+        shutil.rmtree(trace_root)
+
+
+@pytest.fixture
+def temp_dir():
+    """Yield the path of a scratch directory that is deleted once the test is done."""
+    with TemporaryDirectory() as scratch:
+        yield scratch
+
+
+def install_requirements(requirements_file_path):
+    print(f"Installing {requirements_file_path}")
+    p = subprocess.run(["pip", "install", "-r", str(requirements_file_path)])  # argv list: no shell parsing
+    assert p.returncode == 0, f"Failed to install {requirements_file_path}"
+
+
+def run_command_and_check_profiler_output(command, expected_directories, expected_num_files):
+    """Run `command` in a shell, then check each expected directory holds the expected *.json count."""
+    print(f"\nRunning command: {command}")
+    # Output is not captured: it goes straight to the console, so there is nothing to echo on failure.
+    rc = subprocess.run(command, shell=True).returncode
+    assert rc == 0, f"Command failed with return code {rc}: {command}"
+
+    for expected_dir in expected_directories:
+        assert expected_dir.exists(), f"No profiling directory {expected_dir}"
+        trace_files = list(expected_dir.glob("*.json"))
+        assert len(trace_files) == expected_num_files, (
+            f"Expected {expected_num_files} trace file(s) in {expected_dir}, found {len(trace_files)}"
+        )
+
+
+def test_integration_train_and_eval(oh_path, profiling_dir, temp_dir):
+    """Run the GLUE example with train and eval profiling on; expect one trace file per phase."""
+    glue_cmd = (
+        f"python3 {oh_path}/examples/text-classification/run_glue.py "
+        "--model_name_or_path bert-large-uncased-whole-word-masking "
+        "--gaudi_config_name Habana/bert-large-uncased-whole-word-masking "
+        f"--task_name mrpc --do_train --output_dir {temp_dir} "
+        "--overwrite_output_dir --learning_rate 3e-05 "
+        "--per_device_train_batch_size 1 --per_device_eval_batch_size 1 "
+        "--num_train_epochs 1 --use_habana --throughput_warmup_steps 1 "
+        "--save_strategy no --use_lazy_mode --do_eval --max_seq_length 128 "
+        "--use_hpu_graphs_for_inference --sdp_on_bf16 --profiling_steps 1 "
+        "--profiling_warmup_steps 1 --profiling_steps_eval 1 "
+        "--profiling_warmup_steps_eval 1"
+    )
+    install_requirements(f"{oh_path}/examples/text-classification/requirements.txt")
+    # The training loop and the evaluation loop are profiled separately, so each phase
+    # is expected to leave its own subdirectory under the profiling directory.
+    phase_dirs = [profiling_dir / phase for phase in ("train", "evaluation")]
+    run_command_and_check_profiler_output(glue_cmd, phase_dirs, expected_num_files=1)
+
+
+def test_integration_text_generation(oh_path, profiling_dir, temp_dir):
+    """Run the text-generation example with profiling on; expect one trace file under "generate"."""
+    generation_cmd = (
+        f"python3 {oh_path}/examples/text-generation/run_generation.py "
+        "--model_name_or_path bigscience/bloomz-7b1 --batch_size 1 --use_kv_cache "
+        f"--max_new_tokens 100 --use_hpu_graphs --bf16 --output_dir {temp_dir} "
+        "--profiling_steps 1 --profiling_warmup_steps 1"
+    )
+    install_requirements(f"{oh_path}/examples/text-generation/requirements.txt")
+    run_command_and_check_profiler_output(generation_cmd, [profiling_dir / "generate"], expected_num_files=1)
+
+
+@pytest.mark.x8
+def test_integration_stable_diffusion(oh_path, profiling_dir, temp_dir):
+    """Run the 8-process Stable Diffusion example with profiling on; expect one trace file per process."""
+    world_size = 8
+    sd_cmd = (
+        f"python {oh_path}/examples/gaudi_spawn.py --world_size {world_size} "
+        f"{oh_path}/examples/stable-diffusion/text_to_image_generation.py "
+        "--model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 "
+        '--prompts "Sailing ship painting by Van Gogh" --num_images_per_prompt 1 '
+        f"--batch_size 1 --image_save_dir {temp_dir} --scheduler euler_discrete "
+        "--use_habana --use_hpu_graphs --gaudi_config Habana/stable-diffusion --bf16 "
+        "--num_inference_steps 10 --optimize --sdp_on_bf16 "
+        "--profiling_steps 1 --profiling_warmup_steps 1"
+    )
+    install_requirements(f"{oh_path}/examples/stable-diffusion/requirements.txt")
+    run_command_and_check_profiler_output(sd_cmd, [profiling_dir / "stable_diffusion"], expected_num_files=world_size)
diff --git a/tests/test_habana_profiler.py b/tests/test_habana_profiler_unit.py
similarity index 100%
rename from tests/test_habana_profiler.py
rename to tests/test_habana_profiler_unit.py

From 018158a8b5292343b7616e8bb2c247c1de69bec2 Mon Sep 17 00:00:00 2001
From: Urszula Golowicz <urszula.golowicz@intel.com>
Date: Thu, 3 Apr 2025 14:53:59 +0300
Subject: [PATCH 4/6] Sequence profiling in text generation

Add the option to profile whole sequences in text-generation.
Use --profile_whole_sequences boolean arg to enable it.

Signed-off-by: Urszula Golowicz <urszula.golowicz@intel.com>
---
 examples/text-generation/run_generation.py    |  42 +++-
 .../habana/transformers/generation/utils.py   | 195 +++++++-----------
 2 files changed, 103 insertions(+), 134 deletions(-)

diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index 2c0c576fdc..9a18ba483a 100755
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -37,7 +37,7 @@
     save_model,
 )
 
-from optimum.habana.utils import HabanaGenerationTime, get_hpu_memory_stats
+from optimum.habana.utils import HabanaGenerationTime, HabanaProfile, get_hpu_memory_stats
 
 
 logging.basicConfig(
@@ -149,6 +149,11 @@ def setup_parser(parser):
         action="store_true",
         help="Record shapes when enabling profiling.",
     )
+    parser.add_argument(
+        "--profile_whole_sequences",
+        action="store_true",
+        help="When set, profiling step means generation of one whole sequence (not one token).",
+    )
     parser.add_argument(
         "--prompt",
         default=None,
@@ -442,6 +447,20 @@ def main():
     if args.sdp_on_bf16:
         torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
 
+    active_profiler = HabanaProfile(
+        warmup=args.profiling_warmup_steps,
+        active=args.profiling_steps,
+        record_shapes=args.profiling_record_shapes,
+        name="generate",
+    )
+    disabled_profiler = HabanaProfile()
+    if args.profile_whole_sequences:
+        per_sequence_profiler = active_profiler
+        per_token_profiler = disabled_profiler
+    else:
+        per_sequence_profiler = disabled_profiler
+        per_token_profiler = active_profiler
+
     if args.dataset_name is None:
         # Benchmark over the prompts below
         if args.prompt:
@@ -507,7 +526,7 @@ def assemble_prompt(prompt_size, book_path):
 
         def generate(size=None, reduce_recompile=False, disable_profiling=False):
             """Generates sequences from the input sentences and returns them."""
-            profiling_steps = 0 if disable_profiling else args.profiling_steps
+            profiler = disabled_profiler if disable_profiling else per_token_profiler
             timer = HabanaGenerationTime()
             timer.start()
             # Tokenization
@@ -569,11 +588,9 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
                 assistant_model=assistant_model,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
-                profiling_steps=profiling_steps,
-                profiling_warmup_steps=args.profiling_warmup_steps,
                 ignore_eos=args.ignore_eos,
                 iteration_times=iteration_times,
-                profiling_record_shapes=args.profiling_record_shapes,
+                profiler=profiler,
             ).cpu()
             timer.step()
             first_token_time = iteration_times[0] + encode_duration
@@ -628,12 +645,14 @@ def rounder(x):
         e2e_latencies = []
         timer.step()
         # Benchmark over n_iterations iterations
+        per_sequence_profiler.start()
         if dyn_prompt_lens is None:
             for i in range(args.n_iterations):
                 generated, first_token_time, rest_token_time, e2e_latency = generate(None, args.reduce_recompile)
                 first_token_latencies.append(first_token_time)
                 rest_token_latencies.append(rest_token_time)
                 e2e_latencies.append(e2e_latency)
+                per_sequence_profiler.step()
         else:
             repeated_prompt_len = cycle(dyn_prompt_lens)
             for i in range(args.n_iterations):
@@ -643,9 +662,11 @@ def rounder(x):
                 first_token_latencies.append(first_token_time)
                 rest_token_latencies.append(rest_token_time)
                 e2e_latencies.append(e2e_latency)
+                per_sequence_profiler.step()
         timer.step()
         logger.info("Finished running generate")
         duration = timer.last_duration
+        per_sequence_profiler.stop()
         total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens
         throughput = total_new_tokens_generated / duration
         # Calculate average latencies
@@ -778,7 +799,7 @@ def collate_fn(data):
         dataloader = DataLoader(raw_dataset, batch_size=args.batch_size, collate_fn=collate_fn)
 
         def generate_dataset(batch, disable_profiling=False):
-            profiling_steps = 0 if disable_profiling else args.profiling_steps
+            profiler = disabled_profiler if disable_profiling else per_token_profiler
 
             prompt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
             # Move inputs to target device(s)
@@ -791,10 +812,8 @@ def generate_dataset(batch, disable_profiling=False):
                 generation_config=generation_config,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
-                profiling_steps=profiling_steps,
-                profiling_warmup_steps=args.profiling_warmup_steps,
                 ignore_eos=args.ignore_eos,
-                profiling_record_shapes=args.profiling_record_shapes,
+                profiler=profiler,
             ).cpu()
             return prompt, outputs
 
@@ -817,8 +836,11 @@ def generate_dataset(batch, disable_profiling=False):
         duration = 0
         separator = "-" * 50
         logger.info("Running generate dataset...")
+
         timer = HabanaGenerationTime()
         timer.start()
+        per_sequence_profiler.start()
+
         for i, batch in enumerate(dataloader):
             timer.step()
             prompt, outputs = generate_dataset(batch)
@@ -834,7 +856,9 @@ def generate_dataset(batch, disable_profiling=False):
             print(separator)
             if args.run_partial_dataset and args.n_iterations == i + 1:
                 break
+            per_sequence_profiler.step()
         timer.step()
+        per_sequence_profiler.stop()
 
         throughput = total_new_tokens_generated / duration
         # Print Stats
diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py
index a4c9d74311..9a0da8efc8 100755
--- a/optimum/habana/transformers/generation/utils.py
+++ b/optimum/habana/transformers/generation/utils.py
@@ -1090,10 +1090,8 @@ def generate(
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         lazy_mode: Optional[bool] = False,
         hpu_graphs: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
         iteration_times: Optional[List[float]] = None,
-        profiling_record_shapes: Optional[bool] = False,
+        profiler: Optional[HabanaProfile] = None,
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
         r"""
@@ -1163,12 +1161,8 @@ def generate(
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
             hpu_graphs (`bool`, *optional*, defaults to `False`):
                 Whether to use HPU graphs for inference.
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to None):
+                HabanaProfile object to use for profiling.
             kwargs (`Dict[str, Any]`, *optional*):
                 Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
@@ -1630,8 +1624,7 @@ def generate(
                 streamer=streamer,
                 lazy_mode=lazy_mode,
                 ignore_eos=generation_config.ignore_eos,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
                 **model_kwargs,
             )
@@ -1670,10 +1663,8 @@ def generate(
                 streamer=streamer,
                 lazy_mode=lazy_mode,
                 ignore_eos=generation_config.ignore_eos,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1696,10 +1687,8 @@ def generate(
                 streamer=streamer,
                 lazy_mode=lazy_mode,
                 ignore_eos=generation_config.ignore_eos,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1732,10 +1721,8 @@ def generate(
                 generation_config=generation_config,
                 synced_gpus=synced_gpus,
                 lazy_mode=lazy_mode,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1767,10 +1754,8 @@ def generate(
                 generation_config=generation_config,
                 synced_gpus=synced_gpus,
                 lazy_mode=lazy_mode,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1842,10 +1827,8 @@ def typeerror():
                 generation_config=generation_config,
                 synced_gpus=synced_gpus,
                 lazy_mode=lazy_mode,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1923,10 +1906,8 @@ def _contrastive_search(
         streamer: Optional["BaseStreamer"],
         lazy_mode: Optional[bool] = False,
         ignore_eos: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
         r"""
@@ -1960,12 +1941,8 @@ def _contrastive_search(
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
             ignore_eos (`bool`, *optional*, defaults to `False`):
                 Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to None):
+                HabanaProfile object to use for profiling.
             model_kwargs:
                 Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                 If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -2013,13 +1990,9 @@ def _contrastive_search(
 
         this_peer_finished = False
 
-        hb_profiler = HabanaProfile(
-            warmup=profiling_warmup_steps,
-            active=profiling_steps,
-            record_shapes=profiling_record_shapes,
-            name="generation",
-        )
-        hb_profiler.start()
+        if profiler is not None:
+            profiler.start()
+
         bucket_size = model_kwargs.get("bucket_size", -1)
         prev_idx = -1  # avoiding calculate cache_idx when its value is not changing
         bucket_internal = model_kwargs.get("bucket_internal", None)
@@ -2462,7 +2435,9 @@ def _contrastive_search(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profiler.step()
+
+            if profiler is not None:
+                profiler.step()
 
         if (
             model_kwargs.get("use_hpu_graphs", False)
@@ -2475,7 +2450,9 @@ def _contrastive_search(
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profiler.stop()
+        if profiler is not None:
+            profiler.stop()
+
         if streamer is not None:
             streamer.end()
 
@@ -2531,10 +2508,8 @@ def _sample(
         streamer: Optional["BaseStreamer"],
         lazy_mode: Optional[bool] = False,
         ignore_eos: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
         r"""
@@ -2562,12 +2537,8 @@ def _sample(
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
             ignore_eos (`bool`, *optional*, defaults to `False`):
                 Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to None):
+                HabanaProfile object to use for profiling.
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                 an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -2617,13 +2588,8 @@ def _sample(
         bucket_internal = model_kwargs.get("bucket_internal", None)
         reduce_recompile = model_kwargs.get("reduce_recompile", False)
 
-        hb_profiler = HabanaProfile(
-            warmup=profiling_warmup_steps,
-            active=profiling_steps,
-            record_shapes=profiling_record_shapes,
-            name="generation",
-        )
-        hb_profiler.start()
+        if profiler is not None:
+            profiler.start()
 
         if not bucket_internal:
             if bucket_size >= 0:
@@ -2796,7 +2762,9 @@ def _sample(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profiler.step()
+
+            if profiler is not None:
+                profiler.step()
 
             if (
                 not model_kwargs.get("pad_done", False)
@@ -2848,7 +2816,8 @@ def _sample(
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profiler.stop()
+        if profiler is not None:
+            profiler.stop()
 
         if streamer is not None:
             streamer.end()
@@ -2899,10 +2868,8 @@ def _beam_search(
         generation_config: GaudiGenerationConfig,
         synced_gpus: bool,
         lazy_mode: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateBeamOutput, torch.LongTensor]:
         r"""
@@ -2928,12 +2895,8 @@ def _beam_search(
                 `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
             lazy_mode (`bool`, *optional*, defaults to `False`):
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to None):
+                HabanaProfile object to use for profiling.
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                 an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -3107,13 +3070,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             input_ids = torch.stack(return_res)
             return input_ids
 
-        hb_profiler = HabanaProfile(
-            warmup=profiling_warmup_steps,
-            active=profiling_steps,
-            record_shapes=profiling_record_shapes,
-            name="generation",
-        )
-        hb_profiler.start()
+        if profiler is not None:
+            profiler.start()
+
         this_peer_finished = False
 
         bucket_size = model_kwargs.get("bucket_size", -1)
@@ -3354,7 +3313,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
                 else:
                     model_kwargs["cache_idx"] = model_kwargs["kv_cache_len"]
 
-            hb_profiler.step()
+            if profiler is not None:
+                profiler.step()
+
             if self.generation_config.static_shapes:
                 is_min_length_reached = (
                     self.generation_config.min_length and cur_len >= self.generation_config.min_length
@@ -3372,7 +3333,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             ):
                 this_peer_finished = True
 
-            hb_profiler.step()
+            if profiler is not None:
+                profiler.step()
+
             if hb_gen_time is not None:
                 if not time_to_first_token_done:
                     time_to_first_token_done = True
@@ -3409,7 +3372,8 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profiler.stop()
+        if profiler is not None:
+            profiler.stop()
 
         if self.generation_config.static_shapes:
             beam_trace = (beam_trace_idx, beam_trace_scores, beam_trace_indices, beam_trace_tokens)
@@ -3488,10 +3452,8 @@ def _group_beam_search(
         generation_config: GaudiGenerationConfig,
         synced_gpus: bool,
         lazy_mode: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ):
         r"""
@@ -3517,12 +3479,8 @@ def _group_beam_search(
                 `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
             lazy_mode (`bool`, *optional*, defaults to `False`):
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to None):
+                HabanaProfile object to use for profiling.
             model_kwargs:
                 Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
                 model is an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -3546,10 +3504,8 @@ def _constrained_beam_search(
         generation_config: GaudiGenerationConfig,
         synced_gpus: bool,
         lazy_mode: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateBeamOutput, torch.LongTensor]:
         r"""
@@ -3576,12 +3532,8 @@ def _constrained_beam_search(
                 `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
             lazy_mode (`bool`, *optional*, defaults to `False`):
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to `None`):
+                `HabanaProfile` object to use for profiling. If `None`, no profiling is performed.
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                 an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -3650,13 +3602,8 @@ def _constrained_beam_search(
         else:
             decoder_prompt_len = input_ids.shape[-1]
 
-        hb_profiler = HabanaProfile(
-            warmup=profiling_warmup_steps,
-            active=profiling_steps,
-            record_shapes=profiling_record_shapes,
-            name="generation",
-        )
-        hb_profiler.start()
+        if profiler is not None:
+            profiler.start()
 
         time_to_first_token_done = False
         while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
@@ -3786,7 +3733,8 @@ def _constrained_beam_search(
             # increase cur_len
             cur_len = cur_len + 1
 
-            hb_profiler.step()
+            if profiler is not None:
+                profiler.step()
 
             if constrained_beam_scorer.is_done or get_final_stopping_criteria(
                 stopping_criteria(input_ids, scores, token_idx=cur_len)
@@ -3801,7 +3749,9 @@ def _constrained_beam_search(
                     torch_hpu.synchronize()
                 hb_gen_time.step()
 
-        hb_profiler.stop()
+        if profiler is not None:
+            profiler.stop()
+
         sequence_outputs = constrained_beam_scorer.finalize(
             input_ids,
             beam_scores,
@@ -3856,10 +3806,8 @@ def _assisted_decoding(
         streamer: Optional["BaseStreamer"],
         lazy_mode: Optional[bool] = False,
         ignore_eos: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
         r"""
@@ -3890,12 +3838,8 @@ def _assisted_decoding(
                 through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
             lazy_mode (`bool`, *optional*, defaults to `False`):
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to `None`):
+                `HabanaProfile` object to use for profiling. If `None`, no profiling is performed.
             model_kwargs:
                 Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                 If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -3935,12 +3879,9 @@ def _assisted_decoding(
             unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
         model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
 
-        hb_profiler = HabanaProfile(
-            warmup=profiling_warmup_steps,
-            active=profiling_steps,
-            name="generation",
-        )
-        hb_profiler.start()
+        if profiler is not None:
+            profiler.start()
+
         this_peer_finished = False
         is_first_iteration = True  # to preserve the same API in the output as other generation methods
 
@@ -4137,12 +4078,16 @@ def _assisted_decoding(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profiler.step()
+
+            if profiler is not None:
+                profiler.step()
 
             if this_peer_finished and not synced_gpus:
                 break
 
-        hb_profiler.stop()
+        if profiler is not None:
+            profiler.stop()
+
         if streamer is not None:
             streamer.end()
 

From 97319933271e69a248562935c36bd083f9ce7038 Mon Sep 17 00:00:00 2001
From: Adam Stachowicz <astachowicz@habana.ai>
Date: Fri, 4 Jul 2025 21:06:30 +0300
Subject: [PATCH 5/6] Style changes

---
 examples/text-generation/run_generation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index f13300b0f4..cb3caf3ff2 100755
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -510,7 +510,6 @@ def main():
         per_sequence_profiler = disabled_profiler
         per_token_profiler = active_profiler
 
-
     if args.dataset_name == "mlcommons":
         # Benchmark over the prompts below
         def get_ds(args):

From 9e5536bc3c11d19f4a5564723b439e4bef2fea42 Mon Sep 17 00:00:00 2001
From: Adam Stachowicz <105052242+astachowiczhabana@users.noreply.github.com>
Date: Wed, 9 Jul 2025 12:36:08 +0200
Subject: [PATCH 6/6] Update test_habana_profiler_integration.py

---
 tests/test_habana_profiler_integration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_habana_profiler_integration.py b/tests/test_habana_profiler_integration.py
index fa286b6ca4..cb17bfba5c 100644
--- a/tests/test_habana_profiler_integration.py
+++ b/tests/test_habana_profiler_integration.py
@@ -113,7 +113,7 @@ def test_integration_stable_diffusion(oh_path, profiling_dir, temp_dir):
         f"--batch_size 1 --image_save_dir {temp_dir} --scheduler euler_discrete "
         "--use_habana --use_hpu_graphs --gaudi_config Habana/stable-diffusion --bf16 "
         "--num_inference_steps 10 --optimize --sdp_on_bf16 "
-        "--profiling_steps 1 --profiling_warmup_steps 1"
+        "--profiling_steps 1 --profiling_warmup_steps 1 --distributed"
     )
     install_requirements(f"{oh_path}/examples/stable-diffusion/requirements.txt")
     expected_dirs = [profiling_dir / "stable_diffusion"]