From c8563cdff380aee0fcb5e37add43ec82c0cce910 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Sat, 3 Apr 2021 21:08:23 -0700
Subject: [PATCH 1/5] Update accelerator.py

---
 pytorch_lightning/accelerators/accelerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index e97450fdbd8852..57461952aa17d8 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -140,7 +140,7 @@ def teardown(self) -> None:
         """This method is called to teardown the training process.
         It is the right place to release memory and free other ressources.
         """
-        pass
+        self.barrier()
 
     def batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any:
         """Moves the batch to the correct device.

From c9377a19e87758c055bd7a4957c6644f02cafea2 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Sun, 4 Apr 2021 08:47:59 -0700
Subject: [PATCH 2/5] Update pytorch_lightning/accelerators/accelerator.py

Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com>
---
 pytorch_lightning/accelerators/accelerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 57461952aa17d8..408a89d752ac11 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -140,7 +140,7 @@ def teardown(self) -> None:
         """This method is called to teardown the training process.
         It is the right place to release memory and free other ressources.
         """
-        self.barrier()
+        self.barrier("teardown")
 
     def batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any:
         """Moves the batch to the correct device.

From 7e9e49c29c1d89f3bd08fc8d527dcd2189fbbc5d Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 10:43:43 -0700
Subject: [PATCH 3/5] add-test

---
 CHANGELOG.md                                  |  3 +++
 pytorch_lightning/accelerators/accelerator.py | 10 ++++++-
 tests/trainer/test_trainer.py                 | 27 +++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9cab7a63eca6e..24e74e0a177b11 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -177,6 +177,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Add a barrier in the accelerator `teardown` to synchronize processes before execution finishes ([#6814](https://github.com/PyTorchLightning/pytorch-lightning/pull/6814))
+
+
 - Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))
 
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 408a89d752ac11..c2b5631df76ad1 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -138,7 +138,15 @@ def root_device(self) -> torch.device:
 
     def teardown(self) -> None:
         """This method is called to teardown the training process.
-        It is the right place to release memory and free other ressources.
+        It is the right place to release memory and free other resources.
+
+        By default, we add a barrier here to synchronize processes at the end of execution
+        before returning control back to the calling code. This prevents race conditions in patterns like this:
+
+        ```
+        trainer.fit(module)
+        trainer.test() <-- best checkpoint path needs to be available on all ranks
+        ```
         """
         self.barrier("teardown")
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index ed255d690ed834..dc4309624457d1 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1438,7 +1438,9 @@ def setup(self, model, stage):
 )
 @patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics")
 def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, log_interval):
+
     class TestModel(BoringModel):
+
         def training_step(self, *args, **kwargs):
             self.log("foo", -1)
             return super().training_step(*args, **kwargs)
@@ -1888,3 +1890,28 @@ def test_exception_when_testing_or_validating_with_fast_dev_run(tmpdir):
         trainer.validate()
     with pytest.raises(MisconfigurationException, match=r"\.test\(\)` with `fast_dev_run=True"):
         trainer.test()
+
+
+@RunIf(skip_windows=True, min_gpus=2, special=True)
+def test_fit_test_synchronization(tmpdir):
+    """Test that the trainer synchronizes processes before returning control back to the caller. """
+
+    class TestModel(BoringModel):
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('x', loss)
+
+    model = TestModel()
+    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='x', mode='min', save_top_k=1)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=2,
+        accelerator='ddp',
+        gpus=2,
+        callbacks=[checkpoint],
+    )
+    trainer.fit(model)
+    assert os.path.exists(checkpoint.best_model_path), f'Could not find checkpoint at rank {trainer.global_rank}'
+    trainer.test()

From 27db04863aab4399943fdaba5e789b88e798250b Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 13:04:54 -0700
Subject: [PATCH 4/5] Update test_trainer.py

---
 tests/trainer/test_trainer.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index dc4309624457d1..bd8b4f1532ebdf 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1902,9 +1902,14 @@ def validation_step(self, batch, batch_idx):
             output = self.layer(batch)
             loss = self.loss(batch, output)
             self.log('x', loss)
+            return {"x": loss}
+
+        def validation_epoch_end(self, outputs) -> None:
+            avg_x = torch.stack([x['x'] for x in outputs]).mean()
+            self.log('avg_x', avg_x)
 
     model = TestModel()
-    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='x', mode='min', save_top_k=1)
+    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='avg_x', mode='min', save_top_k=1)
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=2,

From 5d8319364e90a40703343a6a2cd60a9dd15e6331 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 14:07:57 -0700
Subject: [PATCH 5/5] Update test_trainer.py

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index bd8b4f1532ebdf..7f1bc8d5263619 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1906,7 +1906,7 @@ def validation_step(self, batch, batch_idx):
 
         def validation_epoch_end(self, outputs) -> None:
             avg_x = torch.stack([x['x'] for x in outputs]).mean()
-            self.log('avg_x', avg_x)
+            self.log('avg_x', avg_x, sync_dist=True)
 
     model = TestModel()
     checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='avg_x', mode='min', save_top_k=1)
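
For context, a minimal sketch (not part of the patch series) of the fit-then-test flow that the new `teardown` barrier synchronizes. The `BoringModel` import path is an assumption based on the test helpers used above; the script is launched once per GPU under DDP:

```python
# Sketch of the race the barrier prevents: under accelerator="ddp", each
# process runs this script. Before this patch, a non-zero rank could return
# from fit() and reach test() while rank 0 was still writing the best
# checkpoint, so resolving best_model_path could fail on that rank.
import os

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from tests.helpers import BoringModel  # assumed import path for the test helper

if __name__ == "__main__":
    model = BoringModel()
    checkpoint = ModelCheckpoint(dirpath="checkpoints")  # saves the latest epoch checkpoint
    trainer = Trainer(max_epochs=2, accelerator="ddp", gpus=2, callbacks=[checkpoint])
    trainer.fit(model)
    # With the barrier in Accelerator.teardown, every rank sees the checkpoint
    # on disk here before test() resolves and loads best_model_path.
    assert os.path.exists(checkpoint.best_model_path)
    trainer.test()
```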