HabanaAI · vivekgoe · Mar 19, 2024 · Mar 19, 2024 · bhargaveede · Mar 19, 2024
diff --git a/optimum/habana/transformers/synapse_profiler_api.py b/optimum/habana/transformers/synapse_profiler_api.py
@@ -0,0 +1,80 @@
+import os
+from ctypes import *
+from enum import Enum
+
+
+path = os.path.dirname(os.getenv('GC_KERNEL_PATH', '').split(':')[0])  # directory of the first GC_KERNEL_PATH entry; '' when unset
+dev_path = os.getenv('BUILD_ROOT_LATEST')  # when set, overrides the directory derived above
+if dev_path:
+    path = dev_path
+    print("using dev environment {}".format(path))
+
+
+def return_c_func(library_name, function_name):
+    """Load the shared library `library_name` with ctypes and return its exported symbol `function_name`."""
+    return getattr(cdll.LoadLibrary(library_name), function_name)  # attribute lookup instead of eval on a formatted string
+
+
+class TraceType(Enum):  # trace-type selectors; users in this file read the member value as `.value[0]`
+    TraceAll = (1,)  # one-element tuple
+    TraceHost = (2,)
+    TraceDevice = (3,)
+    TraceTypeSize = 4  # plain int, unlike the members above
+
+
+class TraceFormat(Enum):  # trace output-format selectors
+    TraceFormatTEF = (1,)  # one-element tuple; read as `.value[0]` by users in this file
+    TraceFormatSize = 2  # plain int
+
+
+class SynapseProfilerApi:
+    """Thin ctypes wrapper around the profiler/synchronize functions looked up in libSynapse.so."""
+    def __init__(self):
+        """Look up the four native entry points under `path` and declare each as returning a C int."""
+        self.lib_file = 'libSynapse.so'
+        self.full_path = os.path.join(path, self.lib_file)
+        self.profiler_start_call = return_c_func(self.full_path, 'synProfilerStart')
+        self.profiler_stop_call = return_c_func(self.full_path, 'synProfilerStop')
+        self.profiler_get_trace_call = return_c_func(self.full_path, 'synProfilerGetTrace')
+        self.profiler_sync_call = return_c_func(self.full_path, 'synDeviceSynchronize')
+        for native_call in (self.profiler_start_call, self.profiler_stop_call,
+                            self.profiler_get_trace_call, self.profiler_sync_call):
+            native_call.restype = c_int
+
+    def profiler_start(self, trace_type: TraceType, device_id: int):
+        """Call htexp._set_profiler_tracer_memory(device_id), then synProfilerStart; returns the C int status."""
+        native_args = (c_int(trace_type.value[0]), c_int32(device_id))
+        import habana_frameworks.torch.utils.experimental as htexp  # imported lazily, only when profiling starts
+        htexp._set_profiler_tracer_memory(device_id)
+        return self.profiler_start_call(*native_args)
+
+    def profiler_sync(self, device_id: int):
+        """Call synDeviceSynchronize for `device_id`; returns the C int status."""
+        return self.profiler_sync_call(c_int32(device_id))
+
+    def profiler_stop(self, trace_type: TraceType, device_id: int):
+        """Call synProfilerStop for `trace_type` on `device_id`; returns the C int status."""
+        return self.profiler_stop_call(c_int(trace_type.value[0]), c_int32(device_id))
+
+    def profiler_get_trace_json(self, trace_type: TraceType, device_id: int):
+        """Call synProfilerGetTrace in TEF format with no output buffer, size or entry-count pointers; returns the C int status."""
+        int_trace_format = c_int(TraceFormat.TraceFormatTEF.value[0])
+        # All three output arguments are None here; nothing is read back into Python.
+        return self.profiler_get_trace_call(c_int(trace_type.value[0]), c_int32(device_id), int_trace_format, None, None, None)
+
+    def profiler_get_trace(self, trace_type: TraceType, device_id: int):
+        """Fetch the TEF trace into memory and write it to 'profiler_data.bin'; returns the C int status of the second call.
+
+        The native function is called twice: first without a buffer (presumably so it reports the needed size), then with one.
+        """
+        int_trace_type = c_int(trace_type.value[0])
+        int32_device_id = c_int32(device_id)
+        int_trace_format = c_int(TraceFormat.TraceFormatTEF.value[0])
+        buffer_size = c_size_t(0)
+        num_entries = c_size_t(0)
+        self.profiler_get_trace_call(int_trace_type, int32_device_id, int_trace_format, None, byref(buffer_size), byref(num_entries))
+        buffer = create_string_buffer(buffer_size.value)
+        result = self.profiler_get_trace_call(int_trace_type, int32_device_id, int_trace_format, buffer, byref(buffer_size), byref(num_entries))
+        with open('profiler_data.bin', 'wb') as f:
+            f.write(buffer.raw[:buffer_size.value])  # write the bytes themselves; a c_char_p object would dump only its pointer value
+        return result
diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
@@ -832,6 +832,11 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args):
         )
         hb_profiler.start()
 
+        if self.args.device_profiling_enable:
+            from .synapse_profiler_api import SynapseProfilerApi, TraceType
+            api = SynapseProfilerApi()
+            curr_step = 0
+
         total_batched_samples = 0
         for epoch in range(epochs_trained, num_train_epochs):
             epoch_iterator = train_dataloader
@@ -914,6 +919,10 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args):
                         if self.model.generation_config.use_fused_rope is False:
                             inputs["use_fused_rope"] = False
 
+                if self.args.device_profiling_enable:
+                    if curr_step == self.args.start_device_profiling_step:
+                        api.profiler_start(TraceType.TraceAll, 0)
+
                 # TODO: keep syncs for fast DDP?
                 with self.accelerator.accumulate(model):
                     tr_loss_step = self.training_step(model, inputs)
@@ -990,6 +999,15 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args):
                     self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
 
                 hb_profiler.step()
+
+                if self.args.device_profiling_enable:
+                    if curr_step == self.args.stop_device_profiling_step:
+                        import habana_frameworks.torch.hpu as hpu
+                        hpu.synchronize()
+                        api.profiler_stop(TraceType.TraceAll, 0)
+                        api.profiler_get_trace_json(TraceType.TraceAll, 0)
+                    curr_step = curr_step + 1
+
                 if self.control.should_epoch_stop or self.control.should_training_stop:
                     break
             if step < 0:

diff --git a/optimum/habana/transformers/training_args.py b/optimum/habana/transformers/training_args.py
@@ -115,6 +115,14 @@ class GaudiTrainingArguments(TrainingArguments):
             Number of steps to ignore for profling.
         profiling_steps (`int`, *optional*, defaults to 0):
             Number of steps to be captured when enabling profiling.
+        profiling_record_shapes (`boolean`, *optional*, defaults to True):
+            Record shapes when enabling profiling.
+        start_device_profiling_step (`int`, *optional*, defaults to 0):
+            Training iteration number where device profiling starts.
+        stop_device_profiling_step (`int`, *optional*, defaults to 0):
+            Training iteration number where device profiling stops.
+        device_profiling_enable (`boolean`, *optional*, defaults to False):
+            Enable device profiling.
     """
 
     use_habana: Optional[bool] = field(
@@ -225,6 +233,22 @@ class GaudiTrainingArguments(TrainingArguments):
         default=True,
         metadata={"help": ("Record shapes when enabling profiling.")},
     )
+
+    start_device_profiling_step: Optional[int] = field(
+        default=0,
+        metadata={"help": ("Training iteration number where device profiling starts.")},
+    )
+
+    stop_device_profiling_step: Optional[int] = field(
+        default=0,
+        metadata={"help": ("Training iteration number where device profiling stops.")},
+    )
+
+    device_profiling_enable: Optional[bool] = field(
+        default=False,
+        metadata={"help": ("Enable device profiler.")},
+    )
+
     # Overriding the default value of optim because 'adamw_hf' is deprecated
     optim: Optional[Union[OptimizerNames, str]] = field(
         default="adamw_torch",
@@ -571,7 +595,6 @@ def __post_init__(self):
             raise ValueError("`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.")
         self.fsdp_config["xla"] = self.fsdp_config.get("xla", False)
         self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False)
-
         # accelerate integration for FSDP
         if len(self.fsdp) > 0 and not self.fsdp_config["xla"]:
             os.environ["ACCELERATE_USE_FSDP"] = "true"