diff --git a/Makefile b/Makefile index e93f23296a..8de3d66c05 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ style: clean # Run unit and integration tests fast_tests: python -m pip install .[tests] - python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py + python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py tests/test_habana_profiler_unit.py # TODO enable when CI has more servers # python -m pytest test_functional_text_generation_example.py @@ -89,11 +89,15 @@ slow_tests_1x: test_installs python -m pip install peft==0.10.0; \ python -m pytest tests/test_peft_inference.py || status2=$$?; \ python -m pytest tests/test_pipeline.py || status3=$$?; \ - exit $$((status1 + status2 + status3)) + python -m pytest tests/test_habana_profiler_integration.py -v -s -m "not x8" || status4=$$?; \ + exit $$((status1 + status2 + status3 + status4)) # Run multi-card non-regression tests slow_tests_8x: test_installs - DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card" + @status1=0; status2=0; \ + DATA_CACHE=$(DATA_CACHE) python -m pytest tests/test_examples.py -v -s -k "multi_card" || status1=$$?; \ + python -m pytest tests/test_habana_profiler_integration.py -v -s -m x8 || status2=$$?; \ + exit $$((status1 + status2)) # Run DeepSpeed non-regression tests slow_tests_deepspeed: test_installs diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py index d585e91022..7c1c87d14b 100755 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py @@ -547,13 +547,25 @@ def parse_args(input_args=None): "--profiling_warmup_steps", default=0, type=int, - help="Number of steps to ignore for profiling.", + help="Number of training steps to ignore for profiling.", ) parser.add_argument( "--profiling_steps", default=0, type=int, - help="Number of steps to capture for profiling.", + help="Number of training steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_warmup_steps_eval", + default=0, + type=int, + help="Number of inference steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps_eval", + default=0, + type=int, + help="Number of inference steps to capture for profiling.", ) parser.add_argument( "--logging_step", @@ -1153,9 +1165,7 @@ def unwrap_model(model, training=False): unwrap_model(model=unet, training=True) hb_profiler = HabanaProfile( - warmup=args.profiling_warmup_steps, - active=args.profiling_steps, - record_shapes=False, + warmup=args.profiling_warmup_steps, active=args.profiling_steps, record_shapes=False, name="train" ) # Train! total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -1521,6 +1531,8 @@ def compute_time_ids(original_size, crops_coords_top_left): args.validation_prompt, num_inference_steps=25, generator=generator, + profiling_warmup_steps=args.profiling_warmup_steps_eval, + profiling_steps=args.profiling_steps_eval, ).images[0] for _ in range(args.num_validation_images) ] diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py old mode 100644 new mode 100755 index ad0e168764..cb3caf3ff2 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -157,6 +157,11 @@ def setup_parser(parser): action="store_true", help="Record shapes when enabling profiling.", ) + parser.add_argument( + "--profile_whole_sequences", + action="store_true", + help="When set, profiling step means generation of one whole sequence (not one token).", + ) parser.add_argument( "--prompt", default=None, @@ -486,10 +491,24 @@ def main(): import habana_frameworks.torch.hpu as torch_hpu + from optimum.habana.utils import HabanaGenerationTime, HabanaProfile, get_hpu_memory_stats + if args.sdp_on_bf16: torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) - from optimum.habana.utils import HabanaGenerationTime, get_hpu_memory_stats + active_profiler = HabanaProfile( + warmup=args.profiling_warmup_steps, + active=args.profiling_steps, + record_shapes=args.profiling_record_shapes, + name="generate", + ) + disabled_profiler = HabanaProfile() + if args.profile_whole_sequences: + per_sequence_profiler = active_profiler + per_token_profiler = disabled_profiler + else: + per_sequence_profiler = disabled_profiler + per_token_profiler = active_profiler if args.dataset_name == "mlcommons": # Benchmark over the prompts below @@ -708,8 +727,9 @@ def assemble_prompt(prompt_size, book_path): elif args.batch_size < len(input_sentences): input_sentences = input_sentences[: args.batch_size] - def generate(size=None, reduce_recompile=False): + def generate(size=None, reduce_recompile=False, disable_profiling=False): """Generates sequences from the input sentences and returns them.""" + profiler = disabled_profiler if disable_profiling else per_token_profiler timer = HabanaGenerationTime() timer.start() # Tokenization @@ -770,11 +790,9 @@ def compute_valid_sequence_lengths_tensor(input_tokens): assistant_model=assistant_model, lazy_mode=use_lazy_mode, hpu_graphs=args.use_hpu_graphs, - profiling_steps=args.profiling_steps, - profiling_warmup_steps=args.profiling_warmup_steps, ignore_eos=args.ignore_eos, iteration_times=iteration_times, - profiling_record_shapes=args.profiling_record_shapes, + profiler=profiler, ).cpu() timer.step() first_token_time = iteration_times[0] + encode_duration @@ -790,10 +808,6 @@ def compute_valid_sequence_lengths_tensor(input_tokens): e2e_latency, ) - from optimum.habana.utils import HabanaProfile - - # compilation stage disable profiling - HabanaProfile.disable() # Compilation logger.info("Graph compilation...") dyn_prompt_lens = args.simulate_dyn_prompt @@ -804,10 +818,10 @@ def compute_valid_sequence_lengths_tensor(input_tokens): for i in range(args.warmup): if dyn_prompt_lens is None: print(f"Warming up iteration {i + 1}/{args.warmup}", flush=True) - generate(None, args.reduce_recompile) + generate(None, args.reduce_recompile, disable_profiling=True) else: print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i + 1}/{args.warmup}", flush=True) - generate(dyn_prompt_lens[0], args.reduce_recompile) + generate(dyn_prompt_lens[0], args.reduce_recompile, disable_profiling=True) else: if args.bucket_size > 0: mn = min(dyn_prompt_lens) @@ -822,11 +836,10 @@ def rounder(x): lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size)) for sz in lst: print(f"Warming up for shape {sz - 1} iteration {i + 1}/{args.warmup}", flush=True) - generate(sz - 1, args.reduce_recompile) + generate(sz - 1, args.reduce_recompile, disable_profiling=True) torch_hpu.synchronize() timer.step() compilation_duration = timer.last_duration - HabanaProfile.enable() total_new_tokens_generated = 0 logger.info("Running generate...") first_token_latencies = [] @@ -834,12 +847,14 @@ def rounder(x): e2e_latencies = [] timer.step() # Benchmark over n_iterations iterations + per_sequence_profiler.start() if dyn_prompt_lens is None: for i in range(args.n_iterations): generated, first_token_time, rest_token_time, e2e_latency = generate(None, args.reduce_recompile) first_token_latencies.append(first_token_time) rest_token_latencies.append(rest_token_time) e2e_latencies.append(e2e_latency) + per_sequence_profiler.step() else: repeated_prompt_len = cycle(dyn_prompt_lens) for i in range(args.n_iterations): @@ -849,9 +864,11 @@ def rounder(x): first_token_latencies.append(first_token_time) rest_token_latencies.append(rest_token_time) e2e_latencies.append(e2e_latency) + per_sequence_profiler.step() timer.step() logger.info("Finished running generate") duration = timer.last_duration + per_sequence_profiler.stop() total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens throughput = total_new_tokens_generated / duration # Calculate average latencies @@ -983,7 +1000,9 @@ def collate_fn(data): dataloader = DataLoader(raw_dataset, batch_size=args.batch_size, collate_fn=collate_fn) - def generate_dataset(batch): + def generate_dataset(batch, disable_profiling=False): + profiler = disabled_profiler if disable_profiling else per_token_profiler + prompt = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True) # Move inputs to target device(s) for t in batch: @@ -995,18 +1014,11 @@ def generate_dataset(batch): generation_config=generation_config, lazy_mode=use_lazy_mode, hpu_graphs=args.use_hpu_graphs, - profiling_steps=args.profiling_steps, - profiling_warmup_steps=args.profiling_warmup_steps, ignore_eos=args.ignore_eos, - profiling_record_shapes=args.profiling_record_shapes, + profiler=profiler, ).cpu() return prompt, outputs - # warmup - from optimum.habana.utils import HabanaProfile - - # compilation stage disable profiling - HabanaProfile.disable() # Compilation logger.info("Graph compilation...") timer = HabanaGenerationTime() @@ -1022,14 +1034,15 @@ def generate_dataset(batch): torch_hpu.synchronize() timer.step() compilation_duration = timer.last_duration - HabanaProfile.enable() - total_new_tokens_generated = 0 duration = 0 separator = "-" * 50 logger.info("Running generate dataset...") + timer = HabanaGenerationTime() timer.start() + per_sequence_profiler.start() + for i, batch in enumerate(dataloader): timer.step() prompt, outputs = generate_dataset(batch) @@ -1045,7 +1058,9 @@ def generate_dataset(batch): print(separator) if args.run_partial_dataset and args.n_iterations == i + 1: break + per_sequence_profiler.step() timer.step() + per_sequence_profiler.stop() throughput = total_new_tokens_generated / duration # Print Stats diff --git a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py index 56714da448..4539b5b822 100644 --- a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -500,6 +500,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="diffuser_pipeline", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py index 4caaf5bfe8..38908696b9 100644 --- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py +++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux.py @@ -470,6 +470,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="diffuser_pipeline", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py index ee2be57274..9a2f91bcff 100644 --- a/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py +++ b/optimum/habana/diffusers/pipelines/flux/pipeline_flux_img2img.py @@ -502,6 +502,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="diffuser_pipeline", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 7efe1059bc..d1537582bc 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -494,6 +494,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="stable_diffusion", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 7cd8d23ade..be361f9186 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -309,6 +309,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="stable_diffusion", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 3086b23c0c..22aa954682 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -513,6 +513,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="stable_diffusion", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index a2a7ec1399..0794927a3b 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -392,6 +392,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="stable_diffusion", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index 1565562936..61c46cf37f 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -542,6 +542,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="stable_diffusion", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 610f8eabba..369ede5834 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -662,6 +662,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="stable_diffusion", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 6846e1a146..cd9550de20 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -502,6 +502,7 @@ def denoising_value_valid(dnv): warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="stable_diffusion", ) hb_profiler.start() diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py index e9285fe4b8..edebec1778 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py @@ -756,6 +756,7 @@ def __call__( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=False, + name="stable_diffusion", ) hb_profiler.start() diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index c5097d15ca..e92b07d6c9 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -1132,10 +1132,8 @@ def generate( use_model_defaults: Optional[bool] = None, lazy_mode: Optional[bool] = False, hpu_graphs: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, iteration_times: Optional[List[float]] = None, - profiling_record_shapes: Optional[bool] = False, + profiler: Optional[HabanaProfile] = None, **kwargs, ) -> Union[GenerateOutput, torch.LongTensor]: r""" @@ -1210,12 +1208,8 @@ def generate( Whether the run is executed in lazy mode or not (i.e. eager mode). hpu_graphs (`bool`, *optional*, defaults to `False`): Whether to use HPU graphs for inference. - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. + profiler (`HabanaProfile`, *optional*, defaults to None): + HabanaProfile object to use for profiling. kwargs (`Dict[str, Any]`, *optional*): Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder @@ -1683,8 +1677,7 @@ def generate( streamer=streamer, lazy_mode=lazy_mode, ignore_eos=generation_config.ignore_eos, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, + profiler=profiler, hb_gen_time=hb_gen_time, **model_kwargs, ) @@ -1723,10 +1716,8 @@ def generate( streamer=streamer, lazy_mode=lazy_mode, ignore_eos=generation_config.ignore_eos, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, + profiler=profiler, hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, **model_kwargs, ) @@ -1749,10 +1740,8 @@ def generate( streamer=streamer, lazy_mode=lazy_mode, ignore_eos=generation_config.ignore_eos, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, + profiler=profiler, hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, **model_kwargs, ) @@ -1785,10 +1774,8 @@ def generate( generation_config=generation_config, synced_gpus=synced_gpus, lazy_mode=lazy_mode, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, + profiler=profiler, hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, **model_kwargs, ) @@ -1820,10 +1807,8 @@ def generate( generation_config=generation_config, synced_gpus=synced_gpus, lazy_mode=lazy_mode, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, + profiler=profiler, hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, **model_kwargs, ) @@ -1895,10 +1880,8 @@ def typeerror(): generation_config=generation_config, synced_gpus=synced_gpus, lazy_mode=lazy_mode, - profiling_warmup_steps=profiling_warmup_steps, - profiling_steps=profiling_steps, + profiler=profiler, hb_gen_time=hb_gen_time, - profiling_record_shapes=profiling_record_shapes, **model_kwargs, ) @@ -1975,10 +1958,8 @@ def _contrastive_search( streamer: Optional["BaseStreamer"], lazy_mode: Optional[bool] = False, ignore_eos: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, + profiler: Optional[HabanaProfile] = None, hb_gen_time: Optional[HabanaGenerationTime] = None, - profiling_record_shapes: Optional[bool] = False, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" @@ -2012,12 +1993,8 @@ def _contrastive_search( Whether the run is executed in lazy mode or not (i.e. eager mode). ignore_eos (`bool`, *optional*, defaults to `False`): Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. + profiler (`HabanaProfile`, *optional*, defaults to None): + HabanaProfile object to use for profiling. model_kwargs: Additional model specific keyword arguments will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -2065,10 +2042,9 @@ def _contrastive_search( this_peer_finished = False - hb_profer = HabanaProfile( - warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes - ) - hb_profer.start() + if profiler is not None: + profiler.start() + bucket_size = model_kwargs.get("bucket_size", -1) prev_idx = -1 # avoiding calculate cache_idx when its value is not changing bucket_internal = model_kwargs.get("bucket_internal", None) @@ -2511,7 +2487,9 @@ def _contrastive_search( torch_hpu.synchronize() hb_gen_time.step() - hb_profer.step() + + if profiler is not None: + profiler.step() if ( model_kwargs.get("use_hpu_graphs", False) @@ -2524,7 +2502,9 @@ def _contrastive_search( # Delete past key value tensors self._remove_past_key_values(model_kwargs) - hb_profer.stop() + if profiler is not None: + profiler.stop() + if streamer is not None: streamer.end() @@ -2580,10 +2560,8 @@ def _sample( streamer: Optional["BaseStreamer"], lazy_mode: Optional[bool] = False, ignore_eos: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, + profiler: Optional[HabanaProfile] = None, hb_gen_time: Optional[HabanaGenerationTime] = None, - profiling_record_shapes: Optional[bool] = False, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" @@ -2611,12 +2589,8 @@ def _sample( Whether the run is executed in lazy mode or not (i.e. eager mode). ignore_eos (`bool`, *optional*, defaults to `False`): Whether to ignore finished sequences (faster in lazy mode and with HPU graphs) or not (eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. + profiler (`HabanaProfile`, *optional*, defaults to None): + HabanaProfile object to use for profiling. model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -2665,10 +2639,8 @@ def _sample( bucket_internal = model_kwargs.get("bucket_internal", None) reduce_recompile = model_kwargs.get("reduce_recompile", False) - hb_profer = HabanaProfile( - warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes - ) - hb_profer.start() + if profiler is not None: + profiler.start() if not bucket_internal: if bucket_size >= 0: @@ -2839,7 +2811,9 @@ def _sample( torch_hpu.synchronize() hb_gen_time.step() - hb_profer.step() + + if profiler is not None: + profiler.step() if ( not model_kwargs.get("pad_done", False) @@ -2891,7 +2865,8 @@ def _sample( # Delete past key value tensors self._remove_past_key_values(model_kwargs) - hb_profer.stop() + if profiler is not None: + profiler.stop() if streamer is not None: streamer.end() @@ -2944,10 +2919,8 @@ def _beam_search( generation_config: GaudiGenerationConfig, synced_gpus: bool, lazy_mode: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, + profiler: Optional[HabanaProfile] = None, hb_gen_time: Optional[HabanaGenerationTime] = None, - profiling_record_shapes: Optional[bool] = False, **model_kwargs, ) -> Union[GenerateBeamOutput, torch.LongTensor]: r""" @@ -2979,12 +2952,8 @@ def _beam_search( `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. + profiler (`HabanaProfile`, *optional*, defaults to None): + HabanaProfile object to use for profiling. model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -3172,10 +3141,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1): input_ids = torch.stack(return_res) return input_ids - hb_profer = HabanaProfile( - warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes - ) - hb_profer.start() + if profiler is not None: + profiler.start() + this_peer_finished = False bucket_size = model_kwargs.get("bucket_size", -1) @@ -3385,7 +3353,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1): else: model_kwargs["cache_idx"] = model_kwargs["kv_cache_len"] - hb_profer.step() + if profiler is not None: + profiler.step() + if self.generation_config.static_shapes: is_min_length_reached = ( self.generation_config.min_length and cur_len >= self.generation_config.min_length @@ -3399,7 +3369,9 @@ def expand_if_needed(tensor, new_size, value, dim=-1): ): this_peer_finished = True - hb_profer.step() + if profiler is not None: + profiler.step() + if hb_gen_time is not None: if not time_to_first_token_done: time_to_first_token_done = True @@ -3436,7 +3408,8 @@ def expand_if_needed(tensor, new_size, value, dim=-1): # Delete past key value tensors self._remove_past_key_values(model_kwargs) - hb_profer.stop() + if profiler is not None: + profiler.stop() if self.generation_config.static_shapes: beam_trace = (beam_trace_idx, beam_trace_scores, beam_trace_indices, beam_trace_tokens) @@ -3515,10 +3488,8 @@ def _group_beam_search( generation_config: GaudiGenerationConfig, synced_gpus: bool, lazy_mode: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, + profiler: Optional[HabanaProfile] = None, hb_gen_time: Optional[HabanaGenerationTime] = None, - profiling_record_shapes: Optional[bool] = False, **model_kwargs, ): r""" @@ -3544,12 +3515,8 @@ def _group_beam_search( `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. + profiler (`HabanaProfile`, *optional*, defaults to None): + HabanaProfile object to use for profiling. model_kwargs: Additional model specific kwargs that will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -3573,10 +3540,8 @@ def _constrained_beam_search( generation_config: GaudiGenerationConfig, synced_gpus: bool, lazy_mode: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, + profiler: Optional[HabanaProfile] = None, hb_gen_time: Optional[HabanaGenerationTime] = None, - profiling_record_shapes: Optional[bool] = False, **model_kwargs, ) -> Union[GenerateBeamOutput, torch.LongTensor]: r""" @@ -3603,12 +3568,8 @@ def _constrained_beam_search( `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. + profiler (`HabanaProfile`, *optional*, defaults to None): + HabanaProfile object to use for profiling. model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -3677,10 +3638,8 @@ def _constrained_beam_search( else: decoder_prompt_len = input_ids.shape[-1] - hb_profer = HabanaProfile( - warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes - ) - hb_profer.start() + if profiler is not None: + profiler.start() time_to_first_token_done = False while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): @@ -3810,7 +3769,8 @@ def _constrained_beam_search( # increase cur_len cur_len = cur_len + 1 - hb_profer.step() + if profiler is not None: + profiler.step() if constrained_beam_scorer.is_done or get_final_stopping_criteria( stopping_criteria(input_ids, scores, token_idx=cur_len) @@ -3825,7 +3785,9 @@ def _constrained_beam_search( torch_hpu.synchronize() hb_gen_time.step() - hb_profer.stop() + if profiler is not None: + profiler.stop() + sequence_outputs = constrained_beam_scorer.finalize( input_ids, beam_scores, @@ -3880,10 +3842,8 @@ def _assisted_decoding( streamer: Optional["BaseStreamer"], lazy_mode: Optional[bool] = False, ignore_eos: Optional[bool] = False, - profiling_warmup_steps: Optional[int] = 0, - profiling_steps: Optional[int] = 0, + profiler: Optional[HabanaProfile] = None, hb_gen_time: Optional[HabanaGenerationTime] = None, - profiling_record_shapes: Optional[bool] = False, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" @@ -3914,12 +3874,8 @@ def _assisted_decoding( through `streamer.put(token_ids)` and the streamer is responsible for any further processing. lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). - profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profling. - profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. - profiling_record_shapes (`bool`, *optional*, defaults to False): - Record shapes when enabling profiling. + profiler (`HabanaProfile`, *optional*, defaults to None): + HabanaProfile object to use for profiling. model_kwargs: Additional model specific keyword arguments will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. @@ -3959,8 +3915,9 @@ def _assisted_decoding( unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) - hb_profer = HabanaProfile(warmup=profiling_warmup_steps, active=profiling_steps) - hb_profer.start() + if profiler is not None: + profiler.start() + this_peer_finished = False is_first_iteration = True # to preserve the same API in the output as other generation methods @@ -4157,12 +4114,16 @@ def _assisted_decoding( torch_hpu.synchronize() hb_gen_time.step() - hb_profer.step() + + if profiler is not None: + profiler.step() if this_peer_finished and not synced_gpus: break - hb_profer.stop() + if profiler is not None: + profiler.stop() + if streamer is not None: streamer.end() diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py index 1bb6a1f570..28180a56ee 100644 --- a/optimum/habana/transformers/trainer.py +++ b/optimum/habana/transformers/trainer.py @@ -918,6 +918,7 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args, use_reentrant: Optio active=self.args.profiling_steps, record_shapes=self.args.profiling_record_shapes, with_stack=self.args.profiling_with_stack, + name="train", ) hb_profiler.start() @@ -1987,6 +1988,15 @@ def evaluation_loop( # set a default dtype of logits logits_dtype: str = "float32" + hb_profiler = HabanaProfile( + warmup=self.args.profiling_warmup_steps_eval, + active=self.args.profiling_steps_eval, + record_shapes=self.args.profiling_record_shapes, + with_stack=self.args.profiling_with_stack, + name=description.lower(), + ) + hb_profiler.start() + # Main evaluation loop start_time_eval = time.time() for step, inputs in enumerate(dataloader): @@ -2077,6 +2087,10 @@ def evaluation_loop( if args.use_lazy_mode: self.htcore.mark_step() + hb_profiler.step() + + hb_profiler.stop() + # After all calls to `.gather_function`, reset to `gather_for_metrics`: self.gather_function = self.accelerator.gather_for_metrics if args.past_index and hasattr(self, "_past"): diff --git a/optimum/habana/transformers/training_args.py b/optimum/habana/transformers/training_args.py index 1cd7b0305e..b7f9a6ff40 100644 --- a/optimum/habana/transformers/training_args.py +++ b/optimum/habana/transformers/training_args.py @@ -136,9 +136,13 @@ class GaudiTrainingArguments(TrainingArguments): non_blocking_data_copy (`bool`, *optional*, defaults to `False`): Whether to enable async data copy when preparing inputs. profiling_warmup_steps (`int`, *optional*, defaults to 0): - Number of steps to ignore for profiling. + Number of training steps to ignore for profiling. profiling_steps (`int`, *optional*, defaults to 0): - Number of steps to be captured when enabling profiling. + Number of training steps to be captured when enabling profiling. + profiling_warmup_steps_eval (`int`, *optional*, defaults to 0): + Number of eval steps to ignore for profiling. + profiling_steps_eval (`int`, *optional*, defaults to 0): + Number of eval steps to be captured when enabling profiling. """ use_habana: Optional[bool] = field( @@ -293,12 +297,22 @@ class GaudiTrainingArguments(TrainingArguments): profiling_warmup_steps: Optional[int] = field( default=0, - metadata={"help": ("Number of steps to ignore for profiling.")}, + metadata={"help": ("Number of training steps to ignore for profiling.")}, ) profiling_steps: Optional[int] = field( default=0, - metadata={"help": ("Number of steps to be captured when enabling profiling.")}, + metadata={"help": ("Number of training steps to be captured when enabling profiling.")}, + ) + + profiling_warmup_steps_eval: Optional[int] = field( + default=0, + metadata={"help": ("Number of eval steps to ignore for profiling.")}, + ) + + profiling_steps_eval: Optional[int] = field( + default=0, + metadata={"help": ("Number of eval steps to be captured when enabling profiling.")}, ) profiling_record_shapes: Optional[bool] = field( diff --git a/optimum/habana/utils.py b/optimum/habana/utils.py index 625128fcb6..74f42f909b 100755 --- a/optimum/habana/utils.py +++ b/optimum/habana/utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import random import subprocess import time @@ -290,12 +291,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.step() -class HabanaProfile(object): - """ - HPU profiler only could be run once, so HABANA_PROFILE_ENABLED, a class static variable shared by all the instances of HabanaProfile, is used to control which part will be captured. - """ - - HABANA_PROFILE_ENABLED = True +class HabanaProfile: + _profilers = [] def __init__( self, @@ -303,65 +300,44 @@ def __init__( active: int = 0, record_shapes: bool = True, with_stack: bool = False, + name: str = "", output_dir: str = "./hpu_profile", wait: int = 0, ): - if active <= 0 or warmup < 0 or not HabanaProfile.HABANA_PROFILE_ENABLED: + self._profiler = None + self._running = False - def noop(): - pass + if active <= 0: + self.start = self.stop = self.step = lambda: None - self.start = noop - self.stop = noop - self.step = noop else: - HabanaProfile.HABANA_PROFILE_ENABLED = False + output_dir = os.path.join(output_dir, name) + schedule = torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=1) activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU] - - profiler = torch.profiler.profile( + self._profiler = torch.profiler.profile( schedule=schedule, activities=activities, on_trace_ready=torch.profiler.tensorboard_trace_handler(output_dir), record_shapes=record_shapes, with_stack=with_stack, ) - self.start = profiler.start - self.stop = profiler.stop - self.step = profiler.step - HabanaProfile.enable.invalid = True - HabanaProfile.disable.invalid = True - - def stop(self): - self.stop() + self._profilers.append(self) def start(self): - self.start() + if any(p._running for p in self._profilers): + raise RuntimeError("Cannot start profiler, another profiler instance is already running") + self._running = True + self._profiler.start() - def step(self): - self.step() + def stop(self): + if self._running: + self._profiler.stop() + self._running = False - @staticmethod - def disable(): - """ - Runs only once and must happen before doing profiling. - """ - if hasattr(HabanaProfile.disable, "invalid"): - if not HabanaProfile.disable.invalid: - HabanaProfile.HABANA_PROFILE_ENABLED = False - else: - HabanaProfile.HABANA_PROFILE_ENABLED = False - - @staticmethod - def enable(): - """ - Runs only once and must happen before doing profiling. - """ - if hasattr(HabanaProfile.enable, "invalid"): - if not HabanaProfile.enable.invalid: - HabanaProfile.HABANA_PROFILE_ENABLED = True - else: - HabanaProfile.HABANA_PROFILE_ENABLED = True + def step(self): + if self._running: + self._profiler.step() def check_optimum_habana_min_version(min_version): diff --git a/tests/test_habana_profiler_integration.py b/tests/test_habana_profiler_integration.py new file mode 100644 index 0000000000..cb17bfba5c --- /dev/null +++ b/tests/test_habana_profiler_integration.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import shutil +import subprocess +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest + + +@pytest.fixture +def oh_path(): + cwd = Path.cwd() + if cwd.name.startswith("optimum-habana"): + oh_path = cwd + for parent in cwd.parents: + if parent.name.startswith("optimum-habana"): + oh_path = parent + return oh_path.resolve() + + +@pytest.fixture +def profiling_dir(oh_path): + p = oh_path / "hpu_profile" + yield p + if p.exists(): + shutil.rmtree(p) + + +@pytest.fixture +def temp_dir(): + td = TemporaryDirectory() + yield td.name + td.cleanup() + + +def install_requirements(requirements_file_path): + print(f"Installing {requirements_file_path}") + p = subprocess.run(f"pip install -r {requirements_file_path}", shell=True) + assert p.returncode == 0, f"Failed to install {requirements_file_path}" + + +def run_command_and_check_profiler_output(command, expected_directories, expected_num_files): + print(f"\nRunning command: {command}") + p = subprocess.run(command, shell=True) + rc = p.returncode + stdout = "" if p.stdout is None else p.stdout.decode() + stderr = "" if p.stderr is None else p.stderr.decode() + if rc != 0: + msg = f"Command failed with return code {rc}\nstdout: {stdout}\nstderr: {stderr}" + assert rc == 0, msg + + for expected_dir in expected_directories: + assert expected_dir.exists(), f"No profiling directory {expected_dir}" + assert len(list(expected_dir.glob("*.json"))) == expected_num_files + + +def test_integration_train_and_eval(oh_path, profiling_dir, temp_dir): + command = ( + f"python3 {oh_path}/examples/text-classification/run_glue.py " + "--model_name_or_path bert-large-uncased-whole-word-masking " + "--gaudi_config_name Habana/bert-large-uncased-whole-word-masking " + f"--task_name mrpc --do_train --output_dir {temp_dir} " + "--overwrite_output_dir --learning_rate 3e-05 " + "--per_device_train_batch_size 1 --per_device_eval_batch_size 1 " + "--num_train_epochs 1 --use_habana --throughput_warmup_steps 1 " + "--save_strategy no --use_lazy_mode --do_eval --max_seq_length 128 " + "--use_hpu_graphs_for_inference --sdp_on_bf16 --profiling_steps 1 " + "--profiling_warmup_steps 1 --profiling_steps_eval 1 " + "--profiling_warmup_steps_eval 1" + ) + install_requirements(f"{oh_path}/examples/text-classification/requirements.txt") + expected_dirs = [ + profiling_dir / "train", + profiling_dir / "evaluation", + ] + run_command_and_check_profiler_output(command, expected_dirs, expected_num_files=1) + + +def test_integration_text_generation(oh_path, profiling_dir, temp_dir): + command = ( + f"python3 {oh_path}/examples/text-generation/run_generation.py " + "--model_name_or_path bigscience/bloomz-7b1 --batch_size 1 --use_kv_cache " + f"--max_new_tokens 100 --use_hpu_graphs --bf16 --output_dir {temp_dir} " + "--profiling_steps 1 --profiling_warmup_steps 1" + ) + install_requirements(f"{oh_path}/examples/text-generation/requirements.txt") + expected_dirs = [profiling_dir / "generate"] + run_command_and_check_profiler_output(command, expected_dirs, expected_num_files=1) + + +@pytest.mark.x8 +def test_integration_stable_diffusion(oh_path, profiling_dir, temp_dir): + world_size = 8 + command = ( + f"python {oh_path}/examples/gaudi_spawn.py --world_size {world_size} " + f"{oh_path}/examples/stable-diffusion/text_to_image_generation.py " + "--model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 " + '--prompts "Sailing ship painting by Van Gogh" --num_images_per_prompt 1 ' + f"--batch_size 1 --image_save_dir {temp_dir} --scheduler euler_discrete " + "--use_habana --use_hpu_graphs --gaudi_config Habana/stable-diffusion --bf16 " + "--num_inference_steps 10 --optimize --sdp_on_bf16 " + "--profiling_steps 1 --profiling_warmup_steps 1 --distributed" + ) + install_requirements(f"{oh_path}/examples/stable-diffusion/requirements.txt") + expected_dirs = [profiling_dir / "stable_diffusion"] + run_command_and_check_profiler_output(command, expected_dirs, expected_num_files=world_size) diff --git a/tests/test_habana_profiler_unit.py b/tests/test_habana_profiler_unit.py new file mode 100644 index 0000000000..646e604866 --- /dev/null +++ b/tests/test_habana_profiler_unit.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +from unittest.mock import MagicMock + +import pytest + +from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi +from optimum.habana.utils import HabanaProfile + + +adapt_transformers_to_gaudi() + + +PROFILER_OUTPUT_DIR = "./hpu_profile" + + +@pytest.fixture +def patched_profiler(monkeypatch): + p = HabanaProfile(warmup=1, active=1) + mock_start = MagicMock() + mock_stop = MagicMock() + mock_step = MagicMock() + monkeypatch.setattr(p._profiler, "start", mock_start) + monkeypatch.setattr(p._profiler, "stop", mock_stop) + monkeypatch.setattr(p._profiler, "step", mock_step) + yield p + + +@pytest.fixture(autouse=True) +def cleanup(): + shutil.rmtree(PROFILER_OUTPUT_DIR, ignore_errors=True) + HabanaProfile._profilers = [] + + +def run_profiling(profiler): + profiler.start() + for _ in range(2): + profiler.step() + profiler.stop() + + +def test_init_profiler_with_no_steps(): + profiler = HabanaProfile() + assert profiler._profiler is None + assert profiler.start() is None + assert not profiler._running + assert profiler.step() is None + assert profiler.stop() is None + + +def test_init_profiler_with_steps(patched_profiler): + assert not patched_profiler._running + assert patched_profiler._profiler is not None + + +def test_start_profiling(patched_profiler): + patched_profiler.start() + assert patched_profiler._running + patched_profiler._profiler.start.assert_called_once() + + +def test_call_step_on_profiler(patched_profiler): + patched_profiler.start() + patched_profiler.step() + assert patched_profiler._running + patched_profiler._profiler.step.assert_called_once() + + +def test_stop_profiling(patched_profiler): + patched_profiler.start() + patched_profiler.stop() + assert not patched_profiler._running + patched_profiler._profiler.stop.assert_called_once() + + +def test_profiler_files(): + profiler = HabanaProfile(warmup=1, active=1) + run_profiling(profiler) + assert os.path.exists(PROFILER_OUTPUT_DIR) + assert len(os.listdir(PROFILER_OUTPUT_DIR)) == 1 + + +def test_profiler_with_name(): + profiler = HabanaProfile(warmup=1, active=1, name="test") + run_profiling(profiler) + expected_dir = os.path.join(PROFILER_OUTPUT_DIR, "test") + assert os.path.exists(expected_dir) + assert len(os.listdir(expected_dir)) == 1 + + +def test_profiler_with_no_steps_doesnt_run(): + profiler = HabanaProfile() + run_profiling(profiler) + assert not os.path.exists(PROFILER_OUTPUT_DIR) + + +def test_two_profilers_can_run_sequentially(): + profiler_0 = HabanaProfile(warmup=1, active=1) + run_profiling(profiler_0) + profiler_1 = HabanaProfile(warmup=1, active=1) + run_profiling(profiler_1) + assert os.path.exists(PROFILER_OUTPUT_DIR) + assert len(os.listdir(PROFILER_OUTPUT_DIR)) == 2 + + +def test_cannot_start_profiler_when_another_is_running(patched_profiler): + another_profiler = HabanaProfile(warmup=1, active=1) + patched_profiler.start() + with pytest.raises(RuntimeError): + another_profiler.start()