diff --git a/CHANGELOG.md b/CHANGELOG.md
index 84c55b4ec8a4a..db110a6e5feef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -249,6 +249,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Raise an exception if using `amp_level` with native `amp_backend` ([#9755](https://github.com/PyTorchLightning/pytorch-lightning/pull/9755))
 
+- Update the logic to check for accumulation steps with deepspeed ([#9826](https://github.com/PyTorchLightning/pytorch-lightning/pull/9826))
+
+
 ### Deprecated
 
 - Deprecated `LightningModule.summarize()` in favor of `pytorch_lightning.utilities.model_summary.summarize()`
 
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 1785b3644e2c7..f706e5f33346d 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -375,11 +375,11 @@ def pre_dispatch(self):
         self.barrier()
 
     def init_deepspeed(self):
-        accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches
-        if not isinstance(accumulate_grad_batches, int):
+        accumulation_scheduler = self.lightning_module.trainer.accumulation_scheduler
+
+        if accumulation_scheduler.epochs != [0]:
             raise MisconfigurationException(
-                "DeepSpeed currently only supports `Trainer.accumulate_grad_batches` being an integer."
-                f" Received {accumulate_grad_batches}"
+                "DeepSpeed currently does not support different `accumulate_grad_batches` at different epochs."
             )
 
         precision = self.lightning_module.trainer.accelerator.precision
diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py
index 6aa919f8c4c42..8849c22777589 100644
--- a/pytorch_lightning/plugins/training_type/ipu.py
+++ b/pytorch_lightning/plugins/training_type/ipu.py
@@ -193,7 +193,7 @@ def _handle_gradient_accumulation_steps(self) -> None:
 
         if accumulation_scheduler.epochs != [0]:
             raise MisconfigurationException(
-                "IPUs currently does not support different `accumulate_grad_batches` at different epoch."
+                "IPUs currently does not support different `accumulate_grad_batches` at different epochs."
             )
 
         # TODO(@tchaton): Add support for accumulate_grad_batches being a dictionary
diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py
index ad8f4243418d1..acb3fd65959eb 100644
--- a/tests/accelerators/test_ipu.py
+++ b/tests/accelerators/test_ipu.py
@@ -308,7 +308,7 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da
 
 
 @RunIf(ipu=True)
-def test_accumulate_grad_batches_dict_fails(tmpdir):
+def test_different_accumulate_grad_batches_fails(tmpdir):
     model = IPUModel()
     trainer = Trainer(default_root_dir=tmpdir, ipus=1, accumulate_grad_batches={1: 2})
     with pytest.raises(
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 96e132d12c1c8..9afcbcd5175da 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -961,3 +961,13 @@ def configure_optimizers(self):
     else:
         # assert called once at init and once during training
         assert mock_step.call_count == 1 + (max_epoch * limit_train_batches)
+
+
+@RunIf(min_gpus=1, deepspeed=True, special=True)
+def test_different_accumulate_grad_batches_fails(tmpdir):
+    model = BoringModel()
+    trainer = Trainer(default_root_dir=tmpdir, accumulate_grad_batches={1: 2}, gpus=1, plugins="deepspeed")
+    with pytest.raises(
+        MisconfigurationException, match="DeepSpeed currently does not support different `accumulate_grad_batches`"
+    ):
+        trainer.fit(model)
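Note (not part of the diff above): the new guard inspects `Trainer.accumulation_scheduler`, the `GradientAccumulationScheduler` callback the Trainer builds from `accumulate_grad_batches`, instead of type-checking the raw value. The sketch below illustrates what the `epochs != [0]` comparison distinguishes; it assumes the 1.5-era callback API and is not code from the PR.

# Sketch only (assumes the pytorch-lightning 1.5-era callback API): how the new
# `epochs != [0]` guard tells a constant accumulation factor apart from a
# per-epoch schedule.
from pytorch_lightning.callbacks import GradientAccumulationScheduler

# An integer `accumulate_grad_batches` becomes a single phase starting at epoch 0.
constant = GradientAccumulationScheduler(scheduling={0: 4})
assert constant.epochs == [0]  # passes the new DeepSpeed/IPU check

# A dict such as {1: 2} implicitly gets {0: 1} added for the first epoch,
# leaving more than one phase, so the plugins now raise MisconfigurationException.
varying = GradientAccumulationScheduler(scheduling={1: 2})
assert varying.epochs == [0, 1]

In practice, `Trainer(accumulate_grad_batches=4, plugins="deepspeed")` keeps working, while `Trainer(accumulate_grad_batches={1: 2}, plugins="deepspeed")` now fails fast, as exercised by the new `test_different_accumulate_grad_batches_fails` tests.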