Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions optimum/habana/transformers/synapse_profiler_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os
from ctypes import *
from enum import Enum


# Directory used to locate the Synapse shared library: the parent directory of
# the first ':'-separated entry of GC_KERNEL_PATH.
# An unset GC_KERNEL_PATH falls back to "" instead of raising AttributeError at
# import time; `path` is then "" and only the bare library file name is used.
path = os.path.dirname(os.getenv('GC_KERNEL_PATH', '').split(':')[0])

# BUILD_ROOT_LATEST, when set and non-empty, overrides the derived directory.
dev_path = os.getenv('BUILD_ROOT_LATEST')
if dev_path:
    path = dev_path
    print("using dev environment {}".format(path))


def return_c_func(library_name, function_name):
    """Load a shared library and return one of its exported functions.

    Args:
        library_name: File name or path of the shared library, passed as-is
            to ``cdll.LoadLibrary``.
        function_name: Name of the exported symbol to look up.

    Returns:
        The ctypes foreign-function object for ``function_name``.

    Raises:
        OSError: If the library cannot be loaded.
        AttributeError: If the library does not export ``function_name``.
    """
    # Plain attribute lookup replaces evaluating a formatted source string:
    # it yields the same function object, but a quote or other special
    # character in either argument can no longer break (or inject into) the
    # evaluated expression.
    library = cdll.LoadLibrary(library_name)
    return getattr(library, function_name)


class TraceType(Enum):
    """Trace-type selector passed to the profiler calls.

    The first three members carry one-element tuples as their value and the
    last one a plain int; code in this file unwraps the tuple before building
    the C integer argument.
    """

    # NOTE(review): the tuple values look like the result of accidental
    # trailing commas - confirm before normalising them to plain ints, since
    # users of `.value` would observe the change.
    TraceAll = (1,)
    TraceHost = (2,)
    TraceDevice = (3,)
    TraceTypeSize = 4


class TraceFormat(Enum):
    """Trace output format selector passed to the get-trace profiler call.

    ``TraceFormatTEF`` carries a one-element tuple as its value while
    ``TraceFormatSize`` carries a plain int; code in this file unwraps the
    tuple before building the C integer argument.
    """

    # NOTE(review): the tuple value looks like the result of an accidental
    # trailing comma - confirm before normalising it to a plain int.
    TraceFormatTEF = (1,)
    TraceFormatSize = 2


class SynapseProfilerApi:
    """ctypes wrapper around four functions exported by ``libSynapse.so``.

    The constructor resolves ``synProfilerStart``, ``synProfilerStop``,
    ``synProfilerGetTrace`` and ``synDeviceSynchronize`` and declares each of
    them as returning a C ``int``. Every public method forwards to one of
    these functions and returns that integer unchanged.
    """

    def __init__(self):
        """Locate the library under the module-level ``path`` and bind the C functions."""
        self.lib_file = 'libSynapse.so'
        self.full_path = os.path.join(path, self.lib_file)
        self.profiler_start_call = return_c_func(self.full_path, 'synProfilerStart')
        self.profiler_start_call.restype = c_int
        self.profiler_stop_call = return_c_func(self.full_path, 'synProfilerStop')
        self.profiler_stop_call.restype = c_int
        self.profiler_get_trace_call = return_c_func(self.full_path, 'synProfilerGetTrace')
        self.profiler_get_trace_call.restype = c_int
        self.profiler_sync_call = return_c_func(self.full_path, 'synDeviceSynchronize')
        self.profiler_sync_call.restype = c_int

    @staticmethod
    def _enum_as_c_int(member):
        """Convert an enum member to a ``c_int``.

        Members whose value is a one-element tuple (e.g. ``(1,)``) are
        unwrapped; members whose value is already a plain int are used
        directly, so every member of the trace enums is accepted.
        """
        raw = member.value
        if isinstance(raw, tuple):
            raw = raw[0]
        return c_int(raw)

    def profiler_start(self, trace_type: "TraceType", device_id: int):
        """Start profiling and return the status of ``synProfilerStart``.

        Args:
            trace_type: Enum member selecting what to trace.
            device_id: Device identifier, passed to the library as ``c_int32``.
        """
        int32_device_id = c_int32(device_id)
        int_trace_type = self._enum_as_c_int(trace_type)
        # Imported lazily so this module can be imported without the Habana
        # PyTorch bridge installed.
        import habana_frameworks.torch.utils.experimental as htexp
        htexp._set_profiler_tracer_memory(device_id)
        return self.profiler_start_call(int_trace_type, int32_device_id)

    def profiler_sync(self, device_id: int):
        """Call ``synDeviceSynchronize`` for ``device_id`` and return its status."""
        int32_device_id = c_int32(device_id)
        return self.profiler_sync_call(int32_device_id)

    def profiler_stop(self, trace_type: "TraceType", device_id: int):
        """Stop profiling and return the status of ``synProfilerStop``.

        Args:
            trace_type: Enum member selecting what was traced.
            device_id: Device identifier, passed to the library as ``c_int32``.
        """
        int32_device_id = c_int32(device_id)
        int_trace_type = self._enum_as_c_int(trace_type)
        return self.profiler_stop_call(int_trace_type, int32_device_id)

    def profiler_get_trace_json(self, trace_type: "TraceType", device_id: int):
        """Call ``synProfilerGetTrace`` in TEF format with null output pointers.

        No buffer, size or entry-count pointer is supplied, so nothing is
        copied back into Python; only the call's status is returned.
        NOTE(review): presumably the library writes the trace to its own
        output location in this mode - confirm against the Synapse docs.
        """
        int32_device_id = c_int32(device_id)
        int_trace_type = self._enum_as_c_int(trace_type)
        int_trace_format = self._enum_as_c_int(TraceFormat.TraceFormatTEF)
        result = self.profiler_get_trace_call(int_trace_type, int32_device_id, int_trace_format, None, None, None)
        return result

    def profiler_get_trace(self, trace_type: "TraceType", device_id: int):
        """Fetch the trace into memory and dump it to ``profiler_data.bin``.

        ``synProfilerGetTrace`` is called twice: first with a null buffer so
        the library can report the required size through ``buffer_size``,
        then with a buffer of that size. The buffer contents are written to
        ``profiler_data.bin`` in the current working directory.

        Returns:
            The status returned by the second ``synProfilerGetTrace`` call.
        """
        int32_device_id = c_int32(device_id)
        int_trace_type = self._enum_as_c_int(trace_type)
        int_trace_format = self._enum_as_c_int(TraceFormat.TraceFormatTEF)
        buffer_size = c_size_t(0)
        num_entries = c_size_t(0)
        # Size query: null buffer, the library fills buffer_size / num_entries.
        self.profiler_get_trace_call(int_trace_type, int32_device_id, int_trace_format, None,
                                     byref(buffer_size), byref(num_entries))
        # Keep the ctypes array itself (it is passed to C as a pointer).
        # Casting it to c_char_p and handing that object to file.write() would
        # write the pointer's own storage (the address), not the trace data.
        trace_buffer = (c_byte * buffer_size.value)()
        result = self.profiler_get_trace_call(int_trace_type, int32_device_id, int_trace_format, trace_buffer,
                                              byref(buffer_size), byref(num_entries))
        with open('profiler_data.bin', 'wb') as f:
            f.write(bytes(trace_buffer))

        return result
18 changes: 18 additions & 0 deletions optimum/habana/transformers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -832,6 +832,11 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args):
)
hb_profiler.start()

if self.args.device_profiling_enable:
from .synapse_profiler_api import SynapseProfilerApi, TraceType
api = SynapseProfilerApi()
curr_step = 0

total_batched_samples = 0
for epoch in range(epochs_trained, num_train_epochs):
epoch_iterator = train_dataloader
Expand Down Expand Up @@ -914,6 +919,10 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args):
if self.model.generation_config.use_fused_rope is False:
inputs["use_fused_rope"] = False

if self.args.device_profiling_enable:
if curr_step == self.args.start_device_profiling_step:
api.profiler_start(TraceType.TraceAll, 0)

# TODO: keep syncs for fast DDP?
with self.accelerator.accumulate(model):
tr_loss_step = self.training_step(model, inputs)
Expand Down Expand Up @@ -990,6 +999,15 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args):
self.control = self.callback_handler.on_substep_end(args, self.state, self.control)

hb_profiler.step()

if self.args.device_profiling_enable:
if curr_step == self.args.stop_device_profiling_step:
import habana_frameworks.torch.hpu as hpu
hpu.synchronize()
api.profiler_stop(TraceType.TraceAll, 0)
api.profiler_get_trace_json(TraceType.TraceAll, 0)
curr_step = curr_step + 1

if self.control.should_epoch_stop or self.control.should_training_stop:
break
if step < 0:
Expand Down
25 changes: 24 additions & 1 deletion optimum/habana/transformers/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,14 @@ class GaudiTrainingArguments(TrainingArguments):
Number of steps to ignore for profiling.
profiling_steps (`int`, *optional*, defaults to 0):
Number of steps to be captured when enabling profiling.
profiling_record_shapes (`boolean`, *optional*, defaults to True):

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where are we using it? Default value is taken from original transformers?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has nothing to do with the current change. I just added a missing docstring entry for the PT profiler. It's being used in the PT profiler initialization in trainer.py; it is a Habana-specific argument (not present in the original transformers).

Record shapes when enabling profiling.
start_device_profiling_step (`int`, *optional*, defaults to 0):
Training iteration number where device profiling starts.
stop_device_profiling_step (`int`, *optional*, defaults to 0):
Training iteration number where device profiling stops.
device_profiling_enable (`boolean`, *optional*, defaults to False):
Enable device profiling.
"""

use_habana: Optional[bool] = field(
Expand Down Expand Up @@ -225,6 +233,22 @@ class GaudiTrainingArguments(TrainingArguments):
default=True,
metadata={"help": ("Record shapes when enabling profiling.")},
)

start_device_profiling_step: Optional[int] = field(
default=0,
metadata={"help": ("Training iteration number where device profiling starts.")},
)

stop_device_profiling_step: Optional[int] = field(
default=0,
metadata={"help": ("Training iteration number where device profiling stops.")},
)

device_profiling_enable: Optional[bool] = field(
default=False,
metadata={"help": ("Enable device profiler.")},
)

# Overriding the default value of optim because 'adamw_hf' is deprecated
optim: Optional[Union[OptimizerNames, str]] = field(
default="adamw_torch",
Expand Down Expand Up @@ -571,7 +595,6 @@ def __post_init__(self):
raise ValueError("`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.")
self.fsdp_config["xla"] = self.fsdp_config.get("xla", False)
self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False)

# accelerate integration for FSDP
if len(self.fsdp) > 0 and not self.fsdp_config["xla"]:
os.environ["ACCELERATE_USE_FSDP"] = "true"
Expand Down