Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for lightning 2.0 upgrade #7176

Merged
merged 10 commits into from
Aug 12, 2023
18 changes: 10 additions & 8 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -2583,11 +2583,13 @@ pipeline {
model.encoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model \
model.decoder_tokenizer.library=sentencepiece \
model.decoder_tokenizer.model=/home/TestData/nlp/nmt/toy_data/spm_64k_all_langs_plus_en.model"
// Change val_check_interval to 1 for resume as the len(dataloader) is 1 due to max_steps being the same as that of training and Lightning 2.0 raises an error
// if val_check_interval > len(dataloader): https://github.com/Lightning-AI/lightning/blob/2.0.6/src/lightning/pytorch/loops/fit_loop.py#L259 at the beginning of fit_loop.run()
sh "python examples/nlp/machine_translation/megatron_nmt_training.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.val_check_interval=1 \
+trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
Expand Down Expand Up @@ -3709,7 +3711,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.val_check_interval=1 \
+trainer.limit_val_batches=2 \
trainer.max_steps=3 \
trainer.precision=16 \
Expand Down Expand Up @@ -4134,7 +4136,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
Expand Down Expand Up @@ -4230,7 +4232,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
Expand Down Expand Up @@ -4326,7 +4328,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
Expand Down Expand Up @@ -4409,7 +4411,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
Expand Down Expand Up @@ -4659,7 +4661,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
Expand Down Expand Up @@ -4821,7 +4823,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=10 \
trainer.val_check_interval=1 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=10 \
Expand Down
7 changes: 2 additions & 5 deletions examples/nlp/language_modeling/megatron_bart_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,9 @@ def main(cfg) -> None:

# update resume from checkpoint found by exp_manager
if cfg.model.resume_from_checkpoint is not None:
resume_from_checkpoint = cfg.model.resume_from_checkpoint
else:
resume_from_checkpoint = trainer._checkpoint_connector._ckpt_path
logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')
trainer.ckpt_path = cfg.model.resume_from_checkpoint

trainer._checkpoint_connector = _CheckpointConnector(trainer)
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
Expand Down
7 changes: 1 addition & 6 deletions examples/nlp/language_modeling/megatron_bert_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,8 @@ def main(cfg) -> None:

exp_manager(trainer, cfg.exp_manager)

# update resume from checkpoint found by exp_manager
# Avoid calling protected API trainer._checkpoint_connector._ckpt_path as lightning 2.0 supports ckpt_path as trainer arg
resume_from_checkpoint = trainer.ckpt_path
# resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint)
logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,6 @@ def main(cfg) -> None:

logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)

if cfg.restore_from_path:
save_restore_connector = NLPSaveRestoreConnector()
if os.path.isdir(cfg.restore_from_path):
Expand Down
2 changes: 0 additions & 2 deletions examples/nlp/language_modeling/megatron_gpt_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,6 @@ def main(cfg) -> None:

logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision
Expand Down
6 changes: 1 addition & 5 deletions examples/nlp/language_modeling/megatron_retro_fine_tune.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,7 @@ def main(cfg) -> None:
trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer)
exp_manager(trainer, cfg.exp_manager)

# update resume from checkpoint found by exp_manager
resume_from_checkpoint = trainer.ckpt_path
logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

# Override timer callback to a stateless one
for idx, callback in enumerate(trainer.callbacks):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,8 @@ def main(cfg) -> None:

exp_manager(trainer, cfg.exp_manager)

# update resume from checkpoint found by exp_manager
resume_from_checkpoint = trainer.ckpt_path
# resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint)
logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
Expand Down
6 changes: 1 addition & 5 deletions examples/nlp/language_modeling/megatron_retro_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,8 @@ def main(cfg) -> None:

exp_manager(trainer, cfg.exp_manager)

# update resume from checkpoint found by exp_manager
resume_from_checkpoint = trainer.ckpt_path
# resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint)
logging.info(f'Resuming training from checkpoint: {resume_from_checkpoint}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ def main(cfg) -> None:
trainer.ckpt_path = cfg.model.resume_from_checkpoint
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision
Expand Down
2 changes: 0 additions & 2 deletions examples/nlp/language_modeling/megatron_t5_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,6 @@ def main(cfg) -> None:
trainer.ckpt_path = cfg.model.resume_from_checkpoint
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,6 @@ def main(cfg) -> None:
trainer.ckpt_path = cfg.model.resume_from_checkpoint
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)

if hasattr(cfg.model.data.train_ds, 'task_name'):
if cfg.model.restore_from_path:
t5_cfg = MegatronT5GLUEModel.restore_from(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,6 @@ def main(cfg) -> None:
trainer.ckpt_path = cfg.model.resume_from_checkpoint
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision
Expand Down
2 changes: 0 additions & 2 deletions examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,6 @@ def main(cfg) -> None:
trainer.ckpt_path = cfg.model.resume_from_checkpoint
logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')

trainer._checkpoint_connector = _CheckpointConnector(trainer)

# hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
with open_dict(cfg):
cfg.model.precision = cfg.trainer.precision
Expand Down
9 changes: 3 additions & 6 deletions nemo/collections/nlp/parts/nlp_overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,6 @@ def optimizer_step(
self,
optimizer: torch.optim.Optimizer,
model: Union["pl.LightningModule", torch.nn.Module],
optimizer_idx: int,
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
Expand All @@ -771,13 +770,11 @@ def optimizer_step(
if self.scaler is None:
assert optimizer.fp32_grad_accumulation, "BF16 uses FP32 grad accumulation"
_ = closure()
self._after_closure(model, optimizer, optimizer_idx)
self._after_closure(model, optimizer)
return optimizer.step(**kwargs)

if isinstance(optimizer, torch.optim.LBFGS):
raise MisconfigurationException(
f"Native AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
raise MisconfigurationException(f"Native AMP and the LBFGS optimizer are not compatible (optimizer).")
assert not optimizer.fp32_grad_accumulation, "FP16 uses FP16 grad accumulation"
closure_result = closure()

Expand All @@ -788,7 +785,7 @@ def optimizer_step(
# `unscale` after the closure is executed but before the `on_before_optimizer_step` hook.
# unscale main (fp32) gradients
self.scaler.unscale_(optimizer)
self._after_closure(model, optimizer, optimizer_idx)
self._after_closure(model, optimizer)
skipped_backward = closure_result is None
# in manual optimization, the closure does not return a value
if not isinstance(model, pl.LightningModule) or not model.automatic_optimization or not skipped_backward:
Expand Down
Loading