diff --git a/Makefile b/Makefile
index e93f23296a..8de3d66c05 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@ style: clean
 # Run unit and integration tests
 fast_tests:
 	python -m pip install .[tests]
-	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py
+	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py tests/test_habana_profiler_unit.py
 # TODO enable when CI has more servers
 #	python -m pytest test_functional_text_generation_example.py
 
@@ -89,11 +89,15 @@ slow_tests_1x: test_installs
 	python -m pip install peft==0.10.0; \
 	python -m pytest tests/test_peft_inference.py || status2=$$?; \
 	python -m pytest tests/test_pipeline.py || status3=$$?; \
-	exit $$((status1 + status2 + status3))
+	python -m pytest tests/test_habana_profiler_integration.py -v -s -m "not x8" || status4=$$?; \
+	exit $$((status1 + status2 + status3 + status4))
 
 # Run multi-card non-regression tests
 slow_tests_8x: test_installs
-	DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card"
+	@status1=0; status2=0; \
+	DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card" || status1=$$?; \
+	python -m pytest tests/test_habana_profiler_integration.py -v -s -m x8 || status2=$$?; \
+	exit $$((status1 + status2))
 
 # Run DeepSpeed non-regression tests
 slow_tests_deepspeed: test_installs
diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py
index d585e91022..7c1c87d14b 100755
--- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py
+++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py
@@ -547,13 +547,25 @@ def parse_args(input_args=None):
         "--profiling_warmup_steps",
         default=0,
         type=int,
-        help="Number of steps to ignore for profiling.",
+        help="Number of training steps to ignore for profiling.",
     )
     parser.add_argument(
         "--profiling_steps",
         default=0,
         type=int,
-        help="Number of steps to capture for profiling.",
+        help="Number of training steps to capture for profiling.",
+    )
+    parser.add_argument(
+        "--profiling_warmup_steps_eval",
+        default=0,
+        type=int,
+        help="Number of inference steps to ignore for profiling.",
+    )
+    parser.add_argument(
+        "--profiling_steps_eval",
+        default=0,
+        type=int,
+        help="Number of inference steps to capture for profiling.",
     )
     parser.add_argument(
         "--logging_step",
@@ -1153,9 +1165,7 @@ def unwrap_model(model, training=False):
 
     unwrap_model(model=unet, training=True)
     hb_profiler = HabanaProfile(
-        warmup=args.profiling_warmup_steps,
-        active=args.profiling_steps,
-        record_shapes=False,
+        warmup=args.profiling_warmup_steps, active=args.profiling_steps, record_shapes=False, name="train"
     )
     # Train!
     total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -1521,6 +1531,8 @@ def compute_time_ids(original_size, crops_coords_top_left):
                         args.validation_prompt,
                         num_inference_steps=25,
                         generator=generator,
+                        profiling_warmup_steps=args.profiling_warmup_steps_eval,
+                        profiling_steps=args.profiling_steps_eval,
                     ).images[0]
                     for _ in range(args.num_validation_images)
                 ]
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
old mode 100644
new mode 100755
index ad0e168764..cb3caf3ff2
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -157,6 +157,11 @@ def setup_parser(parser):
         action="store_true",
         help="Record shapes when enabling profiling.",
     )
+    parser.add_argument(
+        "--profile_whole_sequences",
+        action="store_true",
+        help="When set, profiling step means generation of one whole sequence (not one token).",
+    )
     parser.add_argument(
         "--prompt",
         default=None,
@@ -486,10 +491,24 @@ def main():
 
     import habana_frameworks.torch.hpu as torch_hpu
 
+    from optimum.habana.utils import HabanaGenerationTime, HabanaProfile, get_hpu_memory_stats
+
     if args.sdp_on_bf16:
         torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
 
-    from optimum.habana.utils import HabanaGenerationTime, get_hpu_memory_stats
+    active_profiler = HabanaProfile(
+        warmup=args.profiling_warmup_steps,
+        active=args.profiling_steps,
+        record_shapes=args.profiling_record_shapes,
+        name="generate",
+    )
+    disabled_profiler = HabanaProfile()
+    if args.profile_whole_sequences:
+        per_sequence_profiler = active_profiler
+        per_token_profiler = disabled_profiler
+    else:
+        per_sequence_profiler = disabled_profiler
+        per_token_profiler = active_profiler
 
     if args.dataset_name == "mlcommons":
         # Benchmark over the prompts below
@@ -708,8 +727,9 @@ def assemble_prompt(prompt_size, book_path):
         elif args.batch_size < len(input_sentences):
             input_sentences = input_sentences[: args.batch_size]
 
-        def generate(size=None, reduce_recompile=False):
+        def generate(size=None, reduce_recompile=False, disable_profiling=False):
             """Generates sequences from the input sentences and returns them."""
+            profiler = disabled_profiler if disable_profiling else per_token_profiler
             timer = HabanaGenerationTime()
             timer.start()
             # Tokenization
@@ -770,11 +790,9 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
                 assistant_model=assistant_model,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
-                profiling_steps=args.profiling_steps,
-                profiling_warmup_steps=args.profiling_warmup_steps,
                 ignore_eos=args.ignore_eos,
                 iteration_times=iteration_times,
-                profiling_record_shapes=args.profiling_record_shapes,
+                profiler=profiler,
             ).cpu()
             timer.step()
             first_token_time = iteration_times[0] + encode_duration
@@ -790,10 +808,6 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
                 e2e_latency,
             )
 
-        from optimum.habana.utils import HabanaProfile
-
-        # compilation stage disable profiling
-        HabanaProfile.disable()
         # Compilation
         logger.info("Graph compilation...")
         dyn_prompt_lens = args.simulate_dyn_prompt
@@ -804,10 +818,10 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
             for i in range(args.warmup):
                 if dyn_prompt_lens is None:
                     print(f"Warming up iteration {i + 1}/{args.warmup}", flush=True)
-                    generate(None, args.reduce_recompile)
+                    generate(None, args.reduce_recompile, disable_profiling=True)
                 else:
                     print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i + 1}/{args.warmup}", flush=True)
-                    generate(dyn_prompt_lens[0], args.reduce_recompile)
+                    generate(dyn_prompt_lens[0], args.reduce_recompile, disable_profiling=True)
         else:
             if args.bucket_size > 0:
                 mn = min(dyn_prompt_lens)
@@ -822,11 +836,10 @@ def rounder(x):
                     lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size))
                     for sz in lst:
                         print(f"Warming up for shape {sz - 1} iteration {i + 1}/{args.warmup}", flush=True)
-                        generate(sz - 1, args.reduce_recompile)
+                        generate(sz - 1, args.reduce_recompile, disable_profiling=True)
         torch_hpu.synchronize()
         timer.step()
         compilation_duration = timer.last_duration
-        HabanaProfile.enable()
         total_new_tokens_generated = 0
         logger.info("Running generate...")
         first_token_latencies = []
@@ -834,12 +847,14 @@ def rounder(x):
         e2e_latencies = []
         timer.step()
         # Benchmark over n_iterations iterations
+        per_sequence_profiler.start()
         if dyn_prompt_lens is None:
             for i in range(args.n_iterations):
                 generated, first_token_time, rest_token_time, e2e_latency = generate(None, args.reduce_recompile)
                 first_token_latencies.append(first_token_time)
                 rest_token_latencies.append(rest_token_time)
                 e2e_latencies.append(e2e_latency)
+                per_sequence_profiler.step()
         else:
             repeated_prompt_len = cycle(dyn_prompt_lens)
             for i in range(args.n_iterations):
@@ -849,9 +864,11 @@ def rounder(x):
                 first_token_latencies.append(first_token_time)
                 rest_token_latencies.append(rest_token_time)
                 e2e_latencies.append(e2e_latency)
+                per_sequence_profiler.step()
         timer.step()
         logger.info("Finished running generate")
         duration = timer.last_duration
+        per_sequence_profiler.stop()
         total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens
         throughput = total_new_tokens_generated / duration
         # Calculate average latencies
@@ -983,7 +1000,9 @@ def collate_fn(data):
 
         dataloader = DataLoader(raw_dataset, batch_size=args.batch_size, collate_fn=collate_fn)
 
-        def generate_dataset(batch):
+        def generate_dataset(batch, disable_profiling=False):
+            profiler = disabled_profiler if disable_profiling else per_token_profiler
+
             prompt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
             # Move inputs to target device(s)
             for t in batch:
@@ -995,18 +1014,11 @@ def generate_dataset(batch):
                 generation_config=generation_config,
                 lazy_mode=use_lazy_mode,
                 hpu_graphs=args.use_hpu_graphs,
-                profiling_steps=args.profiling_steps,
-                profiling_warmup_steps=args.profiling_warmup_steps,
                 ignore_eos=args.ignore_eos,
-                profiling_record_shapes=args.profiling_record_shapes,
+                profiler=profiler,
             ).cpu()
             return prompt, outputs
 
-        # warmup
-        from optimum.habana.utils import HabanaProfile
-
-        # compilation stage disable profiling
-        HabanaProfile.disable()
         # Compilation
         logger.info("Graph compilation...")
         timer = HabanaGenerationTime()
@@ -1022,14 +1034,15 @@ def generate_dataset(batch):
         torch_hpu.synchronize()
         timer.step()
         compilation_duration = timer.last_duration
-        HabanaProfile.enable()
-
         total_new_tokens_generated = 0
         duration = 0
         separator = "-" * 50
         logger.info("Running generate dataset...")
+
         timer = HabanaGenerationTime()
         timer.start()
+        per_sequence_profiler.start()
+
         for i, batch in enumerate(dataloader):
             timer.step()
             prompt, outputs = generate_dataset(batch)
@@ -1045,7 +1058,9 @@ def generate_dataset(batch):
             print(separator)
+            per_sequence_profiler.step()
             if args.run_partial_dataset and args.n_iterations == i + 1:
                 break
         timer.step()
+        per_sequence_profiler.stop()
 
         throughput = total_new_tokens_generated / duration
         # Print Stats
diff --git a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py
index 56714da448..4539b5b822 100644
--- a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py
+++ b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py
@@ -500,6 +500,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="diffuser_pipeline",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py
index 4caaf5bfe8..38908696b9 100644
--- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py
+++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py
@@ -470,6 +470,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="diffuser_pipeline",
         )
         hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
index ee2be57274..9a2f91bcff 100644
--- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
+++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py
@@ -502,6 +502,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="diffuser_pipeline",
         )
         hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 7efe1059bc..d1537582bc 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -494,6 +494,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index 7cd8d23ade..be361f9186 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -309,6 +309,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 3086b23c0c..22aa954682 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -513,6 +513,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index a2a7ec1399..0794927a3b 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -392,6 +392,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
index 1565562936..61c46cf37f 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
@@ -542,6 +542,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
 
             hb_profiler.start()
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index 610f8eabba..369ede5834 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -662,6 +662,7 @@ def __call__(
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index 6846e1a146..cd9550de20 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -502,6 +502,7 @@ def denoising_value_valid(dnv):
                 warmup=profiling_warmup_steps,
                 active=profiling_steps,
                 record_shapes=False,
+                name="stable_diffusion",
             )
             hb_profiler.start()
 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py
index e9285fe4b8..edebec1778 100644
--- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py
+++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py
@@ -756,6 +756,7 @@ def __call__(
             warmup=profiling_warmup_steps,
             active=profiling_steps,
             record_shapes=False,
+            name="stable_diffusion",
         )
         hb_profiler.start()
 
diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py
index c5097d15ca..e92b07d6c9 100755
--- a/optimum/habana/transformers/generation/utils.py
+++ b/optimum/habana/transformers/generation/utils.py
@@ -1132,10 +1132,8 @@ def generate(
         use_model_defaults: Optional[bool] = None,
         lazy_mode: Optional[bool] = False,
         hpu_graphs: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
         iteration_times: Optional[List[float]] = None,
-        profiling_record_shapes: Optional[bool] = False,
+        profiler: Optional[HabanaProfile] = None,
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
         r"""
@@ -1210,12 +1208,8 @@ def generate(
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
             hpu_graphs (`bool`, *optional*, defaults to `False`):
                 Whether to use HPU graphs for inference.
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to `None`):
+                `HabanaProfile` object to use for profiling.
             kwargs (`Dict[str, Any]`, *optional*):
                 Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
@@ -1683,8 +1677,7 @@ def generate(
                 streamer=streamer,
                 lazy_mode=lazy_mode,
                 ignore_eos=generation_config.ignore_eos,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
                 **model_kwargs,
             )
@@ -1723,10 +1716,8 @@ def generate(
                 streamer=streamer,
                 lazy_mode=lazy_mode,
                 ignore_eos=generation_config.ignore_eos,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1749,10 +1740,8 @@ def generate(
                 streamer=streamer,
                 lazy_mode=lazy_mode,
                 ignore_eos=generation_config.ignore_eos,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1785,10 +1774,8 @@ def generate(
                 generation_config=generation_config,
                 synced_gpus=synced_gpus,
                 lazy_mode=lazy_mode,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1820,10 +1807,8 @@ def generate(
                 generation_config=generation_config,
                 synced_gpus=synced_gpus,
                 lazy_mode=lazy_mode,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1895,10 +1880,8 @@ def typeerror():
                 generation_config=generation_config,
                 synced_gpus=synced_gpus,
                 lazy_mode=lazy_mode,
-                profiling_warmup_steps=profiling_warmup_steps,
-                profiling_steps=profiling_steps,
+                profiler=profiler,
                 hb_gen_time=hb_gen_time,
-                profiling_record_shapes=profiling_record_shapes,
                 **model_kwargs,
             )
 
@@ -1975,10 +1958,8 @@ def _contrastive_search(
         streamer: Optional["BaseStreamer"],
         lazy_mode: Optional[bool] = False,
         ignore_eos: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
         r"""
@@ -2012,12 +1993,8 @@ def _contrastive_search(
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
             ignore_eos (`bool`, *optional*, defaults to `False`):
                 Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to `None`):
+                `HabanaProfile` object to use for profiling. If `None`, no profiling is performed.
             model_kwargs:
                 Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                 If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -2065,10 +2042,9 @@ def _contrastive_search(
 
         this_peer_finished = False
 
-        hb_profer = HabanaProfile(
-            warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes
-        )
-        hb_profer.start()
+        if profiler is not None:
+            profiler.start()
+
         bucket_size = model_kwargs.get("bucket_size", -1)
         prev_idx = -1  # avoiding calculate cache_idx when its value is not changing
         bucket_internal = model_kwargs.get("bucket_internal", None)
@@ -2511,7 +2487,9 @@ def _contrastive_search(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profer.step()
+
+            if profiler is not None:
+                profiler.step()
 
         if (
             model_kwargs.get("use_hpu_graphs", False)
@@ -2524,7 +2502,9 @@ def _contrastive_search(
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profer.stop()
+        if profiler is not None:
+            profiler.stop()
+
         if streamer is not None:
             streamer.end()
 
@@ -2580,10 +2560,8 @@ def _sample(
         streamer: Optional["BaseStreamer"],
         lazy_mode: Optional[bool] = False,
         ignore_eos: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
         r"""
@@ -2611,12 +2589,8 @@ def _sample(
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
             ignore_eos (`bool`, *optional*, defaults to `False`):
                 Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to `None`):
+                `HabanaProfile` object to use for profiling. If `None`, no profiling is performed.
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                 an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -2665,10 +2639,8 @@ def _sample(
         bucket_internal = model_kwargs.get("bucket_internal", None)
         reduce_recompile = model_kwargs.get("reduce_recompile", False)
 
-        hb_profer = HabanaProfile(
-            warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes
-        )
-        hb_profer.start()
+        if profiler is not None:
+            profiler.start()
 
         if not bucket_internal:
             if bucket_size >= 0:
@@ -2839,7 +2811,9 @@ def _sample(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profer.step()
+
+            if profiler is not None:
+                profiler.step()
 
             if (
                 not model_kwargs.get("pad_done", False)
@@ -2891,7 +2865,8 @@ def _sample(
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profer.stop()
+        if profiler is not None:
+            profiler.stop()
 
         if streamer is not None:
             streamer.end()
@@ -2944,10 +2919,8 @@ def _beam_search(
         generation_config: GaudiGenerationConfig,
         synced_gpus: bool,
         lazy_mode: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateBeamOutput, torch.LongTensor]:
         r"""
@@ -2979,12 +2952,8 @@ def _beam_search(
                 `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
             lazy_mode (`bool`, *optional*, defaults to `False`):
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to `None`):
+                `HabanaProfile` object to use for profiling. If `None`, no profiling is performed.
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                 an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -3172,10 +3141,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             input_ids = torch.stack(return_res)
             return input_ids
 
-        hb_profer = HabanaProfile(
-            warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes
-        )
-        hb_profer.start()
+        if profiler is not None:
+            profiler.start()
+
         this_peer_finished = False
 
         bucket_size = model_kwargs.get("bucket_size", -1)
@@ -3385,7 +3353,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
                 else:
                     model_kwargs["cache_idx"] = model_kwargs["kv_cache_len"]
 
-            hb_profer.step()
+            if profiler is not None:
+                profiler.step()
+
             if self.generation_config.static_shapes:
                 is_min_length_reached = (
                     self.generation_config.min_length and cur_len >= self.generation_config.min_length
@@ -3399,7 +3369,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             ):
                 this_peer_finished = True
 
-            hb_profer.step()
+            if profiler is not None:
+                profiler.step()
+
             if hb_gen_time is not None:
                 if not time_to_first_token_done:
                     time_to_first_token_done = True
@@ -3436,7 +3408,8 @@ def expand_if_needed(tensor, new_size, value, dim=-1):
             # Delete past key value tensors
             self._remove_past_key_values(model_kwargs)
 
-        hb_profer.stop()
+        if profiler is not None:
+            profiler.stop()
 
         if self.generation_config.static_shapes:
             beam_trace = (beam_trace_idx, beam_trace_scores, beam_trace_indices, beam_trace_tokens)
@@ -3515,10 +3488,8 @@ def _group_beam_search(
         generation_config: GaudiGenerationConfig,
         synced_gpus: bool,
         lazy_mode: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ):
         r"""
@@ -3544,12 +3515,8 @@ def _group_beam_search(
                 `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
             lazy_mode (`bool`, *optional*, defaults to `False`):
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to None):
+                HabanaProfile object to use for profiling.
             model_kwargs:
                 Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
                 model is an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -3573,10 +3540,8 @@ def _constrained_beam_search(
         generation_config: GaudiGenerationConfig,
         synced_gpus: bool,
         lazy_mode: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateBeamOutput, torch.LongTensor]:
         r"""
@@ -3603,12 +3568,8 @@ def _constrained_beam_search(
                 `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
             lazy_mode (`bool`, *optional*, defaults to `False`):
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to None):
+                HabanaProfile object to use for profiling.
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                 an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -3677,10 +3638,8 @@ def _constrained_beam_search(
         else:
             decoder_prompt_len = input_ids.shape[-1]
 
-        hb_profer = HabanaProfile(
-            warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes
-        )
-        hb_profer.start()
+        if profiler is not None:
+            profiler.start()
 
         time_to_first_token_done = False
         while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
@@ -3810,7 +3769,8 @@ def _constrained_beam_search(
             # increase cur_len
             cur_len = cur_len + 1
 
-            hb_profer.step()
+            if profiler is not None:
+                profiler.step()
 
             if constrained_beam_scorer.is_done or get_final_stopping_criteria(
                 stopping_criteria(input_ids, scores, token_idx=cur_len)
@@ -3825,7 +3785,9 @@ def _constrained_beam_search(
                     torch_hpu.synchronize()
                 hb_gen_time.step()
 
-        hb_profer.stop()
+        if profiler is not None:
+            profiler.stop()
+
         sequence_outputs = constrained_beam_scorer.finalize(
             input_ids,
             beam_scores,
@@ -3880,10 +3842,8 @@ def _assisted_decoding(
         streamer: Optional["BaseStreamer"],
         lazy_mode: Optional[bool] = False,
         ignore_eos: Optional[bool] = False,
-        profiling_warmup_steps: Optional[int] = 0,
-        profiling_steps: Optional[int] = 0,
+        profiler: Optional[HabanaProfile] = None,
         hb_gen_time: Optional[HabanaGenerationTime] = None,
-        profiling_record_shapes: Optional[bool] = False,
         **model_kwargs,
     ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
         r"""
@@ -3914,12 +3874,8 @@ def _assisted_decoding(
                 through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
             lazy_mode (`bool`, *optional*, defaults to `False`):
                 Whether the run is executed in lazy mode or not (i.e. eager mode).
-            profiling_warmup_steps (`int`, *optional*, defaults to 0):
-                Number of steps to ignore for profling.
-            profiling_steps (`int`, *optional*, defaults to 0):
-                Number of steps to be captured when enabling profiling.
-            profiling_record_shapes (`bool`, *optional*, defaults to False):
-                Record shapes when enabling profiling.
+            profiler (`HabanaProfile`, *optional*, defaults to None):
+                HabanaProfile object to use for profiling.
             model_kwargs:
                 Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                 If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
@@ -3959,8 +3915,9 @@ def _assisted_decoding(
             unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
         model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
 
-        hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps)
-        hb_profer.start()
+        if profiler is not None:
+            profiler.start()
+
         this_peer_finished = False
         is_first_iteration = True  # to preserve the same API in the output as other generation methods
 
@@ -4157,12 +4114,16 @@ def _assisted_decoding(
 
                     torch_hpu.synchronize()
                 hb_gen_time.step()
-            hb_profer.step()
+
+            if profiler is not None:
+                profiler.step()
 
             if this_peer_finished and not synced_gpus:
                 break
 
-        hb_profer.stop()
+        if profiler is not None:
+            profiler.stop()
+
         if streamer is not None:
             streamer.end()
 
diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
index 1bb6a1f570..28180a56ee 100644
--- a/optimum/habana/transformers/trainer.py
+++ b/optimum/habana/transformers/trainer.py
@@ -918,6 +918,7 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args, use_reentrant: Optio
             active=self.args.profiling_steps,
             record_shapes=self.args.profiling_record_shapes,
             with_stack=self.args.profiling_with_stack,
+            name="train",
         )
         hb_profiler.start()
 
@@ -1987,6 +1988,15 @@ def evaluation_loop(
         # set a default dtype of logits
         logits_dtype: str = "float32"
 
+        hb_profiler = HabanaProfile(
+            warmup=self.args.profiling_warmup_steps_eval,
+            active=self.args.profiling_steps_eval,
+            record_shapes=self.args.profiling_record_shapes,
+            with_stack=self.args.profiling_with_stack,
+            name=description.lower(),
+        )
+        hb_profiler.start()
+
         # Main evaluation loop
         start_time_eval = time.time()
         for step, inputs in enumerate(dataloader):
@@ -2077,6 +2087,10 @@ def evaluation_loop(
             if args.use_lazy_mode:
                 self.htcore.mark_step()
 
+            hb_profiler.step()
+
+        hb_profiler.stop()
+
         # After all calls to `.gather_function`, reset to `gather_for_metrics`:
         self.gather_function = self.accelerator.gather_for_metrics
         if args.past_index and hasattr(self, "_past"):
diff --git a/optimum/habana/transformers/training_args.py b/optimum/habana/transformers/training_args.py
index 1cd7b0305e..b7f9a6ff40 100644
--- a/optimum/habana/transformers/training_args.py
+++ b/optimum/habana/transformers/training_args.py
@@ -136,9 +136,13 @@ class GaudiTrainingArguments(TrainingArguments):
         non_blocking_data_copy (`bool`, *optional*, defaults to `False`):
             Whether to enable async data copy when preparing inputs.
         profiling_warmup_steps (`int`, *optional*, defaults to 0):
-            Number of steps to ignore for profiling.
+            Number of training steps to ignore for profiling.
         profiling_steps (`int`, *optional*, defaults to 0):
-            Number of steps to be captured when enabling profiling.
+            Number of training steps to be captured when enabling profiling.
+        profiling_warmup_steps_eval (`int`, *optional*, defaults to 0):
+            Number of eval steps to ignore for profiling.
+        profiling_steps_eval (`int`, *optional*, defaults to 0):
+            Number of eval steps to be captured when enabling profiling.
     """
 
     use_habana: Optional[bool] = field(
@@ -293,12 +297,22 @@ class GaudiTrainingArguments(TrainingArguments):
 
     profiling_warmup_steps: Optional[int] = field(
         default=0,
-        metadata={"help": ("Number of steps to ignore for profiling.")},
+        metadata={"help": ("Number of training steps to ignore for profiling.")},
     )
 
     profiling_steps: Optional[int] = field(
         default=0,
-        metadata={"help": ("Number of steps to be captured when enabling profiling.")},
+        metadata={"help": ("Number of training steps to be captured when enabling profiling.")},
+    )
+
+    profiling_warmup_steps_eval: Optional[int] = field(
+        default=0,
+        metadata={"help": ("Number of eval steps to ignore for profiling.")},
+    )
+
+    profiling_steps_eval: Optional[int] = field(
+        default=0,
+        metadata={"help": ("Number of eval steps to be captured when enabling profiling.")},
     )
 
     profiling_record_shapes: Optional[bool] = field(
diff --git a/optimum/habana/utils.py b/optimum/habana/utils.py
index 625128fcb6..74f42f909b 100755
--- a/optimum/habana/utils.py
+++ b/optimum/habana/utils.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import random
 import subprocess
 import time
@@ -290,12 +291,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self.step()
 
 
-class HabanaProfile(object):
-    """
-    HPU profiler only could be run once, so HABANA_PROFILE_ENABLED, a class static variable shared by all the instances of HabanaProfile, is used to control which part will be captured.
-    """
-
-    HABANA_PROFILE_ENABLED = True
+class HabanaProfile:
+    _profilers = []
 
     def __init__(
         self,
@@ -303,65 +300,44 @@ def __init__(
         active: int = 0,
         record_shapes: bool = True,
         with_stack: bool = False,
+        name: str = "",
         output_dir: str = "./hpu_profile",
         wait: int = 0,
     ):
-        if active <= 0 or warmup < 0 or not HabanaProfile.HABANA_PROFILE_ENABLED:
+        self._profiler = None
+        self._running = False
 
-            def noop():
-                pass
+        if active <= 0:
+            self.start = self.stop = self.step = lambda: None
 
-            self.start = noop
-            self.stop = noop
-            self.step = noop
         else:
-            HabanaProfile.HABANA_PROFILE_ENABLED = False
+            output_dir = os.path.join(output_dir, name)
+
             schedule = torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1)
             activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU]
-
-            profiler = torch.profiler.profile(
+            self._profiler = torch.profiler.profile(
                 schedule=schedule,
                 activities=activities,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(output_dir),
                 record_shapes=record_shapes,
                 with_stack=with_stack,
             )
-            self.start = profiler.start
-            self.stop = profiler.stop
-            self.step = profiler.step
-            HabanaProfile.enable.invalid = True
-            HabanaProfile.disable.invalid = True
-
-    def stop(self):
-        self.stop()
+            self._profilers.append(self)
 
     def start(self):
-        self.start()
+        if any(p._running for p in self._profilers):
+            raise RuntimeError("Cannot start profiler, another profiler instance is already running")
+        self._profiler.start()
+        self._running = True  # set only after a successful start, so a failed start cannot block later profilers
 
-    def step(self):
-        self.step()
+    def stop(self):
+        if self._running:
+            self._profiler.stop()
+            self._running = False
 
-    @staticmethod
-    def disable():
-        """
-        Runs only once and must happen before doing profiling.
-        """
-        if hasattr(HabanaProfile.disable, "invalid"):
-            if not HabanaProfile.disable.invalid:
-                HabanaProfile.HABANA_PROFILE_ENABLED = False
-        else:
-            HabanaProfile.HABANA_PROFILE_ENABLED = False
-
-    @staticmethod
-    def enable():
-        """
-        Runs only once and must happen before doing profiling.
-        """
-        if hasattr(HabanaProfile.enable, "invalid"):
-            if not HabanaProfile.enable.invalid:
-                HabanaProfile.HABANA_PROFILE_ENABLED = True
-        else:
-            HabanaProfile.HABANA_PROFILE_ENABLED = True
+    def step(self):
+        if self._running:
+            self._profiler.step()
 
 
 def check_optimum_habana_min_version(min_version):
diff --git a/tests/test_habana_profiler_integration.py b/tests/test_habana_profiler_integration.py
new file mode 100644
index 0000000000..cb17bfba5c
--- /dev/null
+++ b/tests/test_habana_profiler_integration.py
@@ -0,0 +1,120 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+import subprocess
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pytest
+
+
+@pytest.fixture
+def oh_path():
+    """Root of the optimum-habana checkout (the directory that contains `tests/`).
+
+    Derived from this file's location rather than from the current working directory, so it
+    resolves even when the checkout directory is not named `optimum-habana*` (a name-based
+    search over the cwd and its parents leaves the result unbound in that case).
+    """
+    return Path(__file__).resolve().parent.parent
+
+
+@pytest.fixture
+def profiling_dir(oh_path):
+    p = oh_path / "hpu_profile"
+    shutil.rmtree(p, ignore_errors=True)  # drop stale traces so file counts reflect this run only
+    yield p
+    shutil.rmtree(p, ignore_errors=True)  # leave no profiler output behind for the next test
+
+
+@pytest.fixture
+def temp_dir():
+    """Yield the path of a scratch directory that is deleted once the test finishes."""
+    with TemporaryDirectory() as scratch:
+        yield scratch
+
+
+def install_requirements(requirements_file_path):
+    print(f"Installing {requirements_file_path}")
+    p = subprocess.run(["python", "-m", "pip", "install", "-r", str(requirements_file_path)])  # argv list: no shell quoting
+    assert p.returncode == 0, f"Failed to install {requirements_file_path}"
+
+
+def run_command_and_check_profiler_output(command, expected_directories, expected_num_files):
+    """Run `command` in a shell and check every expected directory holds `expected_num_files` JSON traces."""
+    print(f"\nRunning command: {command}")
+    # Output is not captured (it streams to the console under `pytest -s`), so only the return code is reported.
+    p = subprocess.run(command, shell=True)
+    assert p.returncode == 0, f"Command failed with return code {p.returncode}"
+
+    for expected_dir in expected_directories:
+        assert expected_dir.exists(), f"No profiling directory {expected_dir}"
+        traces = sorted(f.name for f in expected_dir.glob("*.json"))
+        assert len(traces) == expected_num_files, (
+            f"Expected {expected_num_files} trace file(s) in {expected_dir}, found {len(traces)}: {traces}"
+        )
+
+
+def test_integration_train_and_eval(oh_path, profiling_dir, temp_dir):
+    command = (
+        f"python3 {oh_path}/examples/text-classification/run_glue.py "
+        "--model_name_or_path bert-large-uncased-whole-word-masking "
+        "--gaudi_config_name Habana/bert-large-uncased-whole-word-masking "
+        f"--task_name mrpc --do_train --output_dir {temp_dir} "
+        "--overwrite_output_dir --learning_rate 3e-05 "
+        "--per_device_train_batch_size 1 --per_device_eval_batch_size 1 "
+        "--num_train_epochs 1 --use_habana --throughput_warmup_steps 1 "
+        "--save_strategy no --use_lazy_mode --do_eval --max_seq_length 128 "
+        "--use_hpu_graphs_for_inference --sdp_on_bf16 --profiling_steps 1 "
+        "--profiling_warmup_steps 1 --profiling_steps_eval 1 "
+        "--profiling_warmup_steps_eval 1"
+    )
+    install_requirements(f"{oh_path}/examples/text-classification/requirements.txt")
+    expected_dirs = [
+        profiling_dir / "train",
+        profiling_dir / "evaluation",
+    ]
+    run_command_and_check_profiler_output(command, expected_dirs, expected_num_files=1)
+
+
+def test_integration_text_generation(oh_path, profiling_dir, temp_dir):
+    command = (
+        f"python3 {oh_path}/examples/text-generation/run_generation.py "
+        "--model_name_or_path bigscience/bloomz-7b1 --batch_size 1 --use_kv_cache "
+        f"--max_new_tokens 100 --use_hpu_graphs --bf16 --output_dir {temp_dir} "
+        "--profiling_steps 1 --profiling_warmup_steps 1"
+    )
+    install_requirements(f"{oh_path}/examples/text-generation/requirements.txt")
+    expected_dirs = [profiling_dir / "generate"]
+    run_command_and_check_profiler_output(command, expected_dirs, expected_num_files=1)
+
+
+@pytest.mark.x8
+def test_integration_stable_diffusion(oh_path, profiling_dir, temp_dir):
+    world_size = 8
+    command = (
+        f"python {oh_path}/examples/gaudi_spawn.py --world_size {world_size} "
+        f"{oh_path}/examples/stable-diffusion/text_to_image_generation.py "
+        "--model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 "
+        '--prompts "Sailing ship painting by Van Gogh" --num_images_per_prompt 1 '
+        f"--batch_size 1 --image_save_dir {temp_dir} --scheduler euler_discrete "
+        "--use_habana --use_hpu_graphs --gaudi_config Habana/stable-diffusion --bf16 "
+        "--num_inference_steps 10 --optimize --sdp_on_bf16 "
+        "--profiling_steps 1 --profiling_warmup_steps 1 --distributed"
+    )
+    install_requirements(f"{oh_path}/examples/stable-diffusion/requirements.txt")
+    expected_dirs = [profiling_dir / "stable_diffusion"]
+    run_command_and_check_profiler_output(command, expected_dirs, expected_num_files=world_size)
diff --git a/tests/test_habana_profiler_unit.py b/tests/test_habana_profiler_unit.py
new file mode 100644
index 0000000000..646e604866
--- /dev/null
+++ b/tests/test_habana_profiler_unit.py
@@ -0,0 +1,125 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+from unittest.mock import MagicMock
+
+import pytest
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+from optimum.habana.utils import HabanaProfile
+
+
+adapt_transformers_to_gaudi()
+
+
+PROFILER_OUTPUT_DIR = "./hpu_profile"
+
+
+@pytest.fixture
+def patched_profiler(monkeypatch):
+    """Yield an active `HabanaProfile` whose wrapped profiler has mocked `start`, `stop` and `step`.
+
+    The mocks are reachable as `p._profiler.<method>` so tests can assert on the calls.
+    """
+    p = HabanaProfile(warmup=1, active=1)
+    for method_name in ("start", "stop", "step"):
+        monkeypatch.setattr(p._profiler, method_name, MagicMock())
+    yield p
+
+
+@pytest.fixture(autouse=True)
+def cleanup():
+    shutil.rmtree(PROFILER_OUTPUT_DIR, ignore_errors=True)
+    HabanaProfile._profilers = []
+
+
+def run_profiling(profiler):
+    profiler.start()
+    for _ in range(2):
+        profiler.step()
+    profiler.stop()
+
+
+def test_init_profiler_with_no_steps():
+    profiler = HabanaProfile()
+    assert profiler._profiler is None
+    assert profiler.start() is None
+    assert not profiler._running
+    assert profiler.step() is None
+    assert profiler.stop() is None
+
+
+def test_init_profiler_with_steps(patched_profiler):
+    assert not patched_profiler._running
+    assert patched_profiler._profiler is not None
+
+
+def test_start_profiling(patched_profiler):
+    patched_profiler.start()
+    assert patched_profiler._running
+    patched_profiler._profiler.start.assert_called_once()
+
+
+def test_call_step_on_profiler(patched_profiler):
+    patched_profiler.start()
+    patched_profiler.step()
+    assert patched_profiler._running
+    patched_profiler._profiler.step.assert_called_once()
+
+
+def test_stop_profiling(patched_profiler):
+    patched_profiler.start()
+    patched_profiler.stop()
+    assert not patched_profiler._running
+    patched_profiler._profiler.stop.assert_called_once()
+
+
+def test_profiler_files():
+    profiler = HabanaProfile(warmup=1, active=1)
+    run_profiling(profiler)
+    assert os.path.exists(PROFILER_OUTPUT_DIR)
+    assert len(os.listdir(PROFILER_OUTPUT_DIR)) == 1
+
+
+def test_profiler_with_name():
+    profiler = HabanaProfile(warmup=1, active=1, name="test")
+    run_profiling(profiler)
+    expected_dir = os.path.join(PROFILER_OUTPUT_DIR, "test")
+    assert os.path.exists(expected_dir)
+    assert len(os.listdir(expected_dir)) == 1
+
+
+def test_profiler_with_no_steps_doesnt_run():
+    profiler = HabanaProfile()
+    run_profiling(profiler)
+    assert not os.path.exists(PROFILER_OUTPUT_DIR)
+
+
+def test_two_profilers_can_run_sequentially():
+    profiler_0 = HabanaProfile(warmup=1, active=1)
+    run_profiling(profiler_0)
+    profiler_1 = HabanaProfile(warmup=1, active=1)
+    run_profiling(profiler_1)
+    assert os.path.exists(PROFILER_OUTPUT_DIR)
+    assert len(os.listdir(PROFILER_OUTPUT_DIR)) == 2
+
+
+def test_cannot_start_profiler_when_another_is_running(patched_profiler):
+    another_profiler = HabanaProfile(warmup=1, active=1)
+    patched_profiler.start()
+    with pytest.raises(RuntimeError):
+        another_profiler.start()