Pass .scale instead of scaler object to core (NVIDIA#6551)
* pass .scale instead of scaler object to core (NVIDIA#6545)

Signed-off-by: Abhinav Khattar <[email protected]>
Co-authored-by: Eric Harper <[email protected]>

* Update megatron_gpt_model.py

Signed-off-by: Abhinav Khattar <[email protected]>

* scale changes for main

Signed-off-by: Abhinav Khattar <[email protected]>

---------

Signed-off-by: Abhinav Khattar <[email protected]>
Co-authored-by: Abhinav Khattar <[email protected]>
Co-authored-by: Eric Harper <[email protected]>
Signed-off-by: hsiehjackson <[email protected]>
3 people authored and hsiehjackson committed Jun 2, 2023
1 parent d0785d5 commit c7f58d8
Showing 9 changed files with 8 additions and 11 deletions.
@@ -309,7 +309,7 @@ def training_step(self, dataloader_iter, batch_idx):
 forward_only=False,
 tensor_shape=tensor_shape,
 dtype=self.autocast_dtype,
-grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None,
 sequence_parallel=self.cfg.get('sequence_parallel', False),
 enable_autocast=True,
 )
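
The change is the same in every hunk below: the core forward-backward schedule now receives the scaler's bound .scale method as grad_scaler, rather than the scaler object itself, so the callable the schedule applies to the loss no longer depends on the scaler class defining __call__. A minimal sketch of the idea follows; run_backward is a hypothetical stand-in for the core schedule's call site, not a NeMo or Megatron API.

import torch

def run_backward(loss, grad_scaler=None):
    # The schedule only needs an opaque callable to apply to the loss before backward.
    if grad_scaler is not None:
        loss = grad_scaler(loss)
    loss.backward()

device = "cuda" if torch.cuda.is_available() else "cpu"
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))

model = torch.nn.Linear(4, 1).to(device)
loss = model(torch.randn(2, 4, device=device)).sum()

# Old pattern: pass the scaler object (which required the __call__ shim removed
# further down in nlp_overrides.py). New pattern: pass the bound method directly.
run_backward(loss, grad_scaler=scaler.scale if device == "cuda" else None)
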
@@ -298,7 +298,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
 tensor_shape=tensor_shape,
 decoder_seq_length=dec_seq_length,
 dtype=self.autocast_dtype,
-grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None,
 sequence_parallel=self.cfg.get('sequence_parallel', False),
 enable_autocast=True,
 )
@@ -346,7 +346,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
 forward_only=forward_only,
 tensor_shape=tensor_shape,
 dtype=self.autocast_dtype,
-grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None,
 sequence_parallel=self.cfg.get('sequence_parallel', False),
 enable_autocast=True,
 )
@@ -307,7 +307,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
 forward_only=forward_only,
 tensor_shape=tensor_shape,
 dtype=self.autocast_dtype,
-grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None,
 sequence_parallel=self.cfg.get('sequence_parallel', False),
 enable_autocast=True,
 )
@@ -300,7 +300,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
 forward_only=forward_only,
 tensor_shape=tensor_shape,
 dtype=self.autocast_dtype,
-grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None,
 sequence_parallel=self.cfg.get('sequence_parallel', False),
 enable_autocast=True,
 )
@@ -327,7 +327,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
 tensor_shape=tensor_shape,
 decoder_seq_length=self.max_decoder_seq_length,
 dtype=self.autocast_dtype,
-grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None,
 enable_autocast=True,
 )

@@ -195,7 +195,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
 tensor_shape=tensor_shape,
 decoder_seq_length=dec_seq_length,
 dtype=self.autocast_dtype,
-grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None,
 sequence_parallel=self.cfg.get('sequence_parallel', False),
 enable_autocast=True,
 )
@@ -314,7 +314,7 @@ def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only):
 tensor_shape=tensor_shape,
 decoder_seq_length=decoder_seq_length,
 dtype=self.autocast_dtype,
-grad_scaler=self.trainer.precision_plugin.scaler if self.cfg.precision == 16 else None,
+grad_scaler=self.trainer.precision_plugin.scaler.scale if self.cfg.precision == 16 else None,
 sequence_parallel=self.cfg.get('sequence_parallel', False),
 enable_autocast=True,
 )
3 changes: 0 additions & 3 deletions nemo/collections/nlp/parts/nlp_overrides.py
@@ -546,9 +546,6 @@ def __init__(
         self.hysteresis = hysteresis
         self._hysteresis_tracker = self.hysteresis

-    def __call__(self, outputs):
-        return self.scale(outputs)
-
     def _unscale_grads_(self, optimizer, *args):
         if getattr(optimizer, "_custom_amp_unscale_grads", False):
             return optimizer.unscale_grads(*args)
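
Because callers now pass scaler.scale directly, the custom scaler subclass in nlp_overrides.py no longer needs to be callable, so the __call__ shim that merely delegated to self.scale(outputs) is removed; the class keeps only its genuinely custom behavior, such as the hysteresis tracking and the _unscale_grads_ override.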
