Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions optimum/habana/transformers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ def _inner_training_loop(
(self.model_wrapped,) = release_memory(self.model_wrapped)
self.model_wrapped = self.model

# Check for DeepSpeed *after* the intial pass and modify the config
# Check for DeepSpeed *after* the initial pass and modify the config
if self.is_deepspeed_enabled:
# Temporarily unset `self.args.train_batch_size`
original_bs = self.args.per_device_train_batch_size
Expand Down Expand Up @@ -686,14 +686,14 @@ def _inner_training_loop(

# HACK because outputs should always be tuples
def hpu_deepspeed_checkpointing(function, *checkpoint_args, use_reentrant: Optional[bool] = None):
"""DeepSpeed acitvation checkpointing."""
"""DeepSpeed activation checkpointing."""
if use_reentrant is None:
use_reentrant = True
if use_reentrant:
all_outputs = []
CheckpointFunction.apply(function, all_outputs, *checkpoint_args)
else:
logger.info("DeepSpeed acitvation checkpointing=non_reentrant_checkpoint")
logger.info("DeepSpeed activation checkpointing=non_reentrant_checkpoint")
all_outputs = non_reentrant_checkpoint(function, *checkpoint_args)

# Always return a tuple
Expand Down Expand Up @@ -863,7 +863,7 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args, use_reentrant: Optio

# tr_loss is a tensor to avoid synchronization of TPUs through .item()
tr_loss = torch.tensor(0.0).to(args.device)
# _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses
# _total_loss_scalar is updated every time .item() has to be called on tr_loss and stores the sum of all losses
self._total_loss_scalar = 0.0
self._globalstep_last_logged = self.state.global_step
self._zero_model_grad(model)
Expand Down Expand Up @@ -1433,7 +1433,7 @@ def _save_optimizer_and_scheduler(self, output_dir):
)
elif self.args.should_save:
# deepspeed.save_checkpoint above saves model/optim/sched
# This block is exectuted by the main process only
# This block is executed by the main process only
optim_dict = self.optimizer.state_dict()
if self.args.use_habana:
# Move the state dict from HPU to CPU before saving
Expand Down Expand Up @@ -1599,7 +1599,7 @@ def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Te
del inputs
kwargs = {}

# For LOMO optimizers you need to explicitly use the learnign rate
# For LOMO optimizers you need to explicitly use the learning rate
if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
kwargs["learning_rate"] = self._get_learning_rate()

Expand Down Expand Up @@ -1725,7 +1725,7 @@ def evaluate(
From https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/trainer.py#L3162 with the following modification
1. use throughput_warmup_steps in evaluation throughput calculation
"""
# handle multipe eval datasets
# handle multiple eval datasets
override = eval_dataset is not None
eval_dataset = eval_dataset if override else self.eval_dataset
if isinstance(eval_dataset, dict):
Expand Down