From ae2d0111469d5aaf7f386875d1ea26a1e4345203 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 5 Oct 2021 19:37:23 +0530 Subject: [PATCH 1/4] support_dict --- pytorch_lightning/plugins/training_type/deepspeed.py | 8 ++++---- tests/accelerators/test_ipu.py | 2 +- tests/plugins/test_deepspeed_plugin.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 1785b3644e2c7..5435d07fbcf80 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -375,11 +375,11 @@ def pre_dispatch(self): self.barrier() def init_deepspeed(self): - accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches - if not isinstance(accumulate_grad_batches, int): + accumulation_scheduler = self.lightning_module.trainer.accumulation_scheduler + + if accumulation_scheduler.epochs != [0]: raise MisconfigurationException( - "DeepSpeed currently only supports `Trainer.accumulate_grad_batches` being an integer." - f" Received {accumulate_grad_batches}" + "DeepSpeed currently does not support different `accumulate_grad_batches` at different epoch." ) precision = self.lightning_module.trainer.accelerator.precision diff --git a/tests/accelerators/test_ipu.py b/tests/accelerators/test_ipu.py index ad8f4243418d1..acb3fd65959eb 100644 --- a/tests/accelerators/test_ipu.py +++ b/tests/accelerators/test_ipu.py @@ -308,7 +308,7 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da @RunIf(ipu=True) -def test_accumulate_grad_batches_dict_fails(tmpdir): +def test_different_accumulate_grad_batches_fails(tmpdir): model = IPUModel() trainer = Trainer(default_root_dir=tmpdir, ipus=1, accumulate_grad_batches={1: 2}) with pytest.raises( diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 96e132d12c1c8..0691eb43d68b0 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -961,3 +961,13 @@ def configure_optimizers(self): else: # assert called once at init and once during training assert mock_step.call_count == 1 + (max_epoch * limit_train_batches) + + +@RunIf(deepspeed=True) +def test_different_accumulate_grad_batches_fails(tmpdir): + model = BoringModel() + trainer = Trainer(default_root_dir=tmpdir, ipus=1, accumulate_grad_batches={1: 2}) + with pytest.raises( + MisconfigurationException, match="DeepSpeed currently does not support different `accumulate_grad_batches`" + ): + trainer.fit(model) From fd3b61323e9778957ca7fbd35b4a909a5c56d3c3 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 5 Oct 2021 19:42:56 +0530 Subject: [PATCH 2/4] chlog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84c55b4ec8a4a..db110a6e5feef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -249,6 +249,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Raise an exception if using `amp_level` with native `amp_backend` ([#9755](https://github.com/PyTorchLightning/pytorch-lightning/pull/9755)) +- Update the logic to check for accumulation steps with deepspeed ([#9826](https://github.com/PyTorchLightning/pytorch-lightning/pull/9826)) + + ### Deprecated - Deprecated `LightningModule.summarize()` in favor of `pytorch_lightning.utilities.model_summary.summarize()` From dd7fed865f8b974f8fcd5a30e3eb1b979938b996 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 5 Oct 2021 21:25:53 +0530 Subject: [PATCH 3/4] fix test --- tests/plugins/test_deepspeed_plugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 0691eb43d68b0..9afcbcd5175da 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -963,10 +963,10 @@ def configure_optimizers(self): assert mock_step.call_count == 1 + (max_epoch * limit_train_batches) -@RunIf(deepspeed=True) +@RunIf(min_gpus=1, deepspeed=True, special=True) def test_different_accumulate_grad_batches_fails(tmpdir): model = BoringModel() - trainer = Trainer(default_root_dir=tmpdir, ipus=1, accumulate_grad_batches={1: 2}) + trainer = Trainer(default_root_dir=tmpdir, accumulate_grad_batches={1: 2}, gpus=1, plugins="deepspeed") with pytest.raises( MisconfigurationException, match="DeepSpeed currently does not support different `accumulate_grad_batches`" ): From 86a0bcf78f6e4429e97c84271ae67aa9652f8989 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 6 Oct 2021 14:19:53 +0530 Subject: [PATCH 4/4] epochs --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- pytorch_lightning/plugins/training_type/ipu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 5435d07fbcf80..f706e5f33346d 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -379,7 +379,7 @@ def init_deepspeed(self): if accumulation_scheduler.epochs != [0]: raise MisconfigurationException( - "DeepSpeed currently does not support different `accumulate_grad_batches` at different epoch." + "DeepSpeed currently does not support different `accumulate_grad_batches` at different epochs." ) precision = self.lightning_module.trainer.accelerator.precision diff --git a/pytorch_lightning/plugins/training_type/ipu.py b/pytorch_lightning/plugins/training_type/ipu.py index 6aa919f8c4c42..8849c22777589 100644 --- a/pytorch_lightning/plugins/training_type/ipu.py +++ b/pytorch_lightning/plugins/training_type/ipu.py @@ -193,7 +193,7 @@ def _handle_gradient_accumulation_steps(self) -> None: if accumulation_scheduler.epochs != [0]: raise MisconfigurationException( - "IPUs currently does not support different `accumulate_grad_batches` at different epoch." + "IPUs currently does not support different `accumulate_grad_batches` at different epochs." ) # TODO(@tchaton): Add support for accumulate_grad_batches being a dictionary
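
---

The guard introduced by this series rejects a per-epoch `accumulate_grad_batches` schedule by inspecting the trainer's accumulation scheduler: if the factor changes anywhere other than epoch 0 (`epochs != [0]`), a `MisconfigurationException` is raised. Below is a minimal, self-contained sketch of that behavior. It does not import Lightning; `FakeAccumulationScheduler`, `check_deepspeed_accumulation`, and the stand-in exception class are illustrative names, and the assumption that an integer `accumulate_grad_batches` maps to a `{0: n}` schedule (hence `epochs == [0]`) reflects how the Lightning gradient-accumulation scheduler behaved around the time of this patch.

```python
# Minimal sketch (not the actual Lightning classes) of the check added in this patch.


class MisconfigurationException(Exception):
    """Stand-in for pytorch_lightning.utilities.exceptions.MisconfigurationException."""


class FakeAccumulationScheduler:
    """Illustrative stand-in for the trainer's accumulation scheduler.

    `epochs` lists the epochs at which the accumulation factor changes, e.g.:
        accumulate_grad_batches=4      -> scheduling {0: 4} -> epochs == [0]
        accumulate_grad_batches={1: 2} -> scheduling {1: 2} -> epochs == [1]
    """

    def __init__(self, scheduling: dict) -> None:
        self.scheduling = scheduling
        self.epochs = sorted(scheduling)


def check_deepspeed_accumulation(scheduler: FakeAccumulationScheduler) -> None:
    # Mirrors the new guard: DeepSpeed only supports a single, constant
    # accumulation factor that applies from epoch 0 onwards.
    if scheduler.epochs != [0]:
        raise MisconfigurationException(
            "DeepSpeed currently does not support different `accumulate_grad_batches` at different epochs."
        )


if __name__ == "__main__":
    check_deepspeed_accumulation(FakeAccumulationScheduler({0: 4}))  # OK: constant factor
    try:
        check_deepspeed_accumulation(FakeAccumulationScheduler({1: 2}))  # changes at epoch 1
    except MisconfigurationException as err:
        print(f"rejected as expected: {err}")
```

The same check is applied in the IPU plugin (PATCH 4/4 only adjusts its error message), so a dict-valued `accumulate_grad_batches` such as `{1: 2}` fails fast on both backends instead of silently using a fixed factor.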