diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index ea5e002450..f5eae87362 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -375,6 +375,12 @@ def main(): token=model_args.token, ) + if training_args.do_train and training_args.use_compiled_autograd: + from habana_frameworks.torch.dynamo.compile_backend.experimental import enable_compiled_autograd + + enable_compiled_autograd() + torch._C._set_autograd_fallback_mode("nothing") + # Log on each process the small summary: mixed_precision = training_args.bf16 or gaudi_config.use_torch_autocast logger.warning( diff --git a/optimum/habana/accelerate/accelerator.py b/optimum/habana/accelerate/accelerator.py index 38e0ea9da7..3a916d7ca3 100644 --- a/optimum/habana/accelerate/accelerator.py +++ b/optimum/habana/accelerate/accelerator.py @@ -118,6 +118,7 @@ def __init__( step_scheduler_with_optimizer: bool = True, kwargs_handlers: list[KwargsHandler] | None = None, dynamo_backend: GaudiDynamoBackend | str | None = None, + dynamic: bool | None = None, distribution_strategy: str = None, force_autocast: bool = False, ): @@ -310,6 +311,7 @@ def __init__( FutureWarning, ) self.step_scheduler_with_optimizer = step_scheduler_with_optimizer + self.dynamic = dynamic # Mixed precision attributes self.scaler = None @@ -776,7 +778,7 @@ def _prepare_deepspeed(self, *args): if self.state.dynamo_plugin.backend == GaudiDynamoBackend.HPU_BACKEND and not is_compiled_module( kwargs["model"] ): - engine.compile() + engine.compile(compile_kwargs={"dynamic": self.dynamic}) if optimizer is not None: optimizer = DeepSpeedOptimizerWrapper(optimizer) if scheduler is not None: diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py index 5c418e66b7..6f45828af0 100644 --- a/optimum/habana/transformers/trainer.py +++ b/optimum/habana/transformers/trainer.py @@ -2409,6 +2409,7 @@ def create_accelerator_and_postprocess(self): "deepspeed_plugin": self.args.deepspeed_plugin, "gradient_accumulation_plugin": gradient_accumulation_plugin, "distribution_strategy": self.args.distribution_strategy, + "dynamic": self.args.compile_dynamic, } if is_accelerate_available("0.28.0"): args["dataloader_config"] = dataloader_config diff --git a/optimum/habana/transformers/training_args.py b/optimum/habana/transformers/training_args.py index 5a65074fc9..82ba68d3b2 100644 --- a/optimum/habana/transformers/training_args.py +++ b/optimum/habana/transformers/training_args.py @@ -97,6 +97,10 @@ class GaudiTrainingArguments(TrainingArguments): Whether to use HPU graphs for performing inference. It will speed up latency but may not be compatible with some operations. use_hpu_graphs_for_training (`bool`, *optional*, defaults to `False`): Whether to use HPU graphs for performing inference. It will speed up training but may not be compatible with some operations. + use_compiled_autograd (`bool`, *optional*, defaults to `False`): + Whether to use compiled autograd for training. Currently only for summarization models. + compile_dynamic (`bool|None`, *optional*, defaults to `None`): + Set value of 'dynamic' parameter for torch.compile. disable_tensor_cache_hpu_graphs (`bool`, *optional*, defaults to `False`): Whether to disable tensor cache when using hpu graphs. If True, tensors won't be cached in hpu graph and memory can be saved. max_hpu_graphs (`int`, *optional*): @@ -156,6 +160,16 @@ class GaudiTrainingArguments(TrainingArguments): }, ) + use_compiled_autograd: Optional[bool] = field( + default=False, + metadata={"help": ("Whether to use compiled autograd for training. Currently only for summarization models.")}, + ) + + compile_dynamic: Optional[bool | None] = field( + default=None, + metadata={"help": ("Set value of 'dynamic' parameter for torch.compile.")}, + ) + disable_tensor_cache_hpu_graphs: Optional[bool] = field( default=False, metadata={"help": "Whether to use a tensor cache for hpu graphs."},