From 533b80b977c371a006811200631e4c9ae5af4efc Mon Sep 17 00:00:00 2001
From: Vivek
Date: Tue, 19 Mar 2024 11:01:33 +0200
Subject: [PATCH] Add training args to capture device profiling traces

---
Review notes (this section is ignored by git am):
- synapse_profiler_api.py was revised during review, so the blob id on its
  "index" line below no longer matches the file content.
- The leading whitespace of the trainer.py context line at the end of this
  section was rebuilt by hand; verify it against the target file before applying.

 .../transformers/synapse_profiler_api.py      | 116 +++++++++++++++++++
 optimum/habana/transformers/trainer.py        |  18 +++
 optimum/habana/transformers/training_args.py  |  25 +++-
 3 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 optimum/habana/transformers/synapse_profiler_api.py

diff --git a/optimum/habana/transformers/synapse_profiler_api.py b/optimum/habana/transformers/synapse_profiler_api.py
new file mode 100644
index 0000000000..a5f2ba27df
--- /dev/null
+++ b/optimum/habana/transformers/synapse_profiler_api.py
@@ -0,0 +1,116 @@
+"""ctypes bindings for the profiler entry points exported by libSynapse.so."""
+
+import os
+from ctypes import byref, c_int, c_int32, c_size_t, cdll, create_string_buffer
+from enum import Enum
+
+
+# Directory searched for libSynapse.so: the parent of the first GC_KERNEL_PATH entry,
+# overridden by BUILD_ROOT_LATEST when that is set. An unset GC_KERNEL_PATH gives ""
+# (bare library name, resolved by the dynamic loader) instead of an AttributeError at import.
+path = os.path.dirname(os.getenv("GC_KERNEL_PATH", "").split(":")[0])
+dev_path = os.getenv("BUILD_ROOT_LATEST")
+if dev_path:
+    path = dev_path
+    print("using dev environment {}".format(path))
+
+
+def return_c_func(library_name, function_name):
+    """Load the shared library `library_name` and return its symbol `function_name`.
+
+    Raises:
+        OSError: the library cannot be loaded.
+        AttributeError: the library does not export `function_name`.
+    """
+    # getattr replaces eval() on a formatted string: same lookup, but a quote in the
+    # library path can no longer break the expression or execute arbitrary code.
+    return getattr(cdll.LoadLibrary(library_name), function_name)
+
+
+class TraceType(Enum):
+    """Trace selector passed as the first argument of the synProfiler* calls."""
+
+    # Plain ints: the stray trailing commas used to turn all but the last value into 1-tuples.
+    TraceAll = 1
+    TraceHost = 2
+    TraceDevice = 3
+    TraceTypeSize = 4
+
+
+class TraceFormat(Enum):
+    """Output format selector passed to synProfilerGetTrace."""
+
+    TraceFormatTEF = 1
+    TraceFormatSize = 2
+
+
+class SynapseProfilerApi:
+    """Python wrapper around the profiler functions of libSynapse.so.
+
+    Every method returns the integer result of the underlying C call unchanged;
+    nothing here interprets it.
+    """
+
+    def __init__(self):
+        self.lib_file = "libSynapse.so"
+        self.full_path = os.path.join(path, self.lib_file)
+        self.profiler_start_call = self._bind("synProfilerStart")
+        self.profiler_stop_call = self._bind("synProfilerStop")
+        self.profiler_get_trace_call = self._bind("synProfilerGetTrace")
+        self.profiler_sync_call = self._bind("synDeviceSynchronize")
+
+    def _bind(self, function_name):
+        """Fetch `function_name` from the library and declare its result type as C int."""
+        func = return_c_func(self.full_path, function_name)
+        func.restype = c_int
+        return func
+
+    def profiler_start(self, trace_type: TraceType, device_id: int):
+        """Call synProfilerStart for `trace_type` on `device_id`."""
+        # Function-scope import: importing this module does not require habana_frameworks.
+        import habana_frameworks.torch.utils.experimental as htexp
+
+        htexp._set_profiler_tracer_memory(device_id)
+        return self.profiler_start_call(c_int(trace_type.value), c_int32(device_id))
+
+    def profiler_sync(self, device_id: int):
+        """Call synDeviceSynchronize on `device_id`."""
+        return self.profiler_sync_call(c_int32(device_id))
+
+    def profiler_stop(self, trace_type: TraceType, device_id: int):
+        """Call synProfilerStop for `trace_type` on `device_id`."""
+        return self.profiler_stop_call(c_int(trace_type.value), c_int32(device_id))
+
+    def profiler_get_trace_json(self, trace_type: TraceType, device_id: int):
+        """Call synProfilerGetTrace in TEF format with null buffer, size and entry-count arguments.
+
+        NOTE(review): presumably this makes the library emit the trace on its own - confirm.
+        """
+        return self.profiler_get_trace_call(
+            c_int(trace_type.value), c_int32(device_id), c_int(TraceFormat.TraceFormatTEF.value), None, None, None
+        )
+
+    def profiler_get_trace(self, trace_type: TraceType, device_id: int):
+        """Fetch the TEF trace into memory and write it to `profiler_data.bin`.
+
+        Returns the result of the second synProfilerGetTrace call.
+        """
+        int_trace_type = c_int(trace_type.value)
+        int32_device_id = c_int32(device_id)
+        int_trace_format = c_int(TraceFormat.TraceFormatTEF.value)
+        buffer_size = c_size_t(0)
+        num_entries = c_size_t(0)
+        # First call passes no buffer; buffer_size is then used to size the allocation below.
+        self.profiler_get_trace_call(
+            int_trace_type, int32_device_id, int_trace_format, None, byref(buffer_size), byref(num_entries)
+        )
+        buffer = create_string_buffer(buffer_size.value)
+        result = self.profiler_get_trace_call(
+            int_trace_type, int32_device_id, int_trace_format, buffer, byref(buffer_size), byref(num_entries)
+        )
+        with open("profiler_data.bin", "wb") as f:
+            # Write the buffer contents; handing the c_char_p object itself to write()
+            # (as before) stored only the pointer-sized object, not the trace bytes.
+            f.write(buffer.raw[: buffer_size.value])
+
+        return result
diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
index 030d931256..f95acd984d 100644
--- a/optimum/habana/transformers/trainer.py
+++ b/optimum/habana/transformers/trainer.py
@@ -832,6 +832,11 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args):
         )
hb_profiler.start() + if self.args.device_profiling_enable: + from .synapse_profiler_api import SynapseProfilerApi, TraceType + api = SynapseProfilerApi() + curr_step = 0 + total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): epoch_iterator = train_dataloader @@ -914,6 +919,10 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args): if self.model.generation_config.use_fused_rope is False: inputs["use_fused_rope"] = False + if self.args.device_profiling_enable: + if curr_step == self.args.start_device_profiling_step: + api.profiler_start(TraceType.TraceAll, 0) + # TODO: keep syncs for fast DDP? with self.accelerator.accumulate(model): tr_loss_step = self.training_step(model, inputs) @@ -990,6 +999,15 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args): self.control = self.callback_handler.on_substep_end(args, self.state, self.control) hb_profiler.step() + + if self.args.device_profiling_enable: + if curr_step == self.args.stop_device_profiling_step: + import habana_frameworks.torch.hpu as hpu + hpu.synchronize() + api.profiler_stop(TraceType.TraceAll, 0) + api.profiler_get_trace_json(TraceType.TraceAll, 0) + curr_step = curr_step + 1 + if self.control.should_epoch_stop or self.control.should_training_stop: break if step < 0: diff --git a/optimum/habana/transformers/training_args.py b/optimum/habana/transformers/training_args.py index c921316510..a5eaabff48 100644 --- a/optimum/habana/transformers/training_args.py +++ b/optimum/habana/transformers/training_args.py @@ -115,6 +115,14 @@ class GaudiTrainingArguments(TrainingArguments): Number of steps to ignore for profling. profiling_steps (`int`, *optional*, defaults to 0): Number of steps to be captured when enabling profiling. + profiling_record_shapes (`boolean`, *optional*, defaults to True): + Record shapes when enabling profiling. + start_device_profiling_step (`int`, *optional*, defaults to 0): + Training iteration number where device profiling starts. 
+ stop_device_profiling_step (`int`, *optional*, defaults to 0): + Training iteration number where device profiling stops. + device_profiling_enable (`boolean`, *optional*, defaults to False): + Enable device profiling. """ use_habana: Optional[bool] = field( @@ -225,6 +233,22 @@ class GaudiTrainingArguments(TrainingArguments): default=True, metadata={"help": ("Record shapes when enabling profiling.")}, ) + + start_device_profiling_step: Optional[int] = field( + default=0, + metadata={"help": ("Training iteration number where device profiling starts.")}, + ) + + stop_device_profiling_step: Optional[int] = field( + default=0, + metadata={"help": ("Training iteration number where device profiling stops.")}, + ) + + device_profiling_enable: Optional[bool] = field( + default=False, + metadata={"help": ("Enable device profiler.")}, + ) + # Overriding the default value of optim because 'adamw_hf' is deprecated optim: Optional[Union[OptimizerNames, str]] = field( default="adamw_torch", @@ -571,7 +595,6 @@ def __post_init__(self): raise ValueError("`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.") self.fsdp_config["xla"] = self.fsdp_config.get("xla", False) self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False) - # accelerate integration for FSDP if len(self.fsdp) > 0 and not self.fsdp_config["xla"]: os.environ["ACCELERATE_USE_FSDP"] = "true"