From 7e9e49c29c1d89f3bd08fc8d527dcd2189fbbc5d Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 10:43:43 -0700
Subject: [PATCH] add-test

---
 CHANGELOG.md                                  |  3 +++
 pytorch_lightning/accelerators/accelerator.py | 10 ++++++-
 tests/trainer/test_trainer.py                 | 27 +++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9cab7a63eca6e..24e74e0a177b11 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -177,6 +177,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Add a barrier in the accelerator `teardown` to synchronize processes before execution finishes ([#6814](https://github.com/PyTorchLightning/pytorch-lightning/pull/6814))
+
+
 - Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))
 
 
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 408a89d752ac11..c2b5631df76ad1 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -138,7 +138,15 @@ def root_device(self) -> torch.device:
     def teardown(self) -> None:
         """This method is called to teardown the training process.
 
-        It is the right place to release memory and free other ressources.
+        It is the right place to release memory and free other resources.
+
+        By default, we add a barrier here to synchronize processes at the end of execution
+        before returning control back to the calling code. This prevents race conditions in patterns like this:
+
+        ```
+        trainer.fit(module)
+        trainer.test() <-- best checkpoint path needs to be available on all ranks
+        ```
         """
         self.barrier("teardown")
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index ed255d690ed834..dc4309624457d1 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1438,7 +1438,9 @@ def setup(self, model, stage):
 )
 @patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics")
 def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, log_interval):
+
     class TestModel(BoringModel):
+
         def training_step(self, *args, **kwargs):
             self.log("foo", -1)
             return super().training_step(*args, **kwargs)
@@ -1888,3 +1890,28 @@ def test_exception_when_testing_or_validating_with_fast_dev_run(tmpdir):
         trainer.validate()
     with pytest.raises(MisconfigurationException, match=r"\.test\(\)` with `fast_dev_run=True"):
         trainer.test()
+
+
+@RunIf(skip_windows=True, min_gpus=2, special=True)
+def test_fit_test_synchronization(tmpdir):
+    """Test that the trainer synchronizes processes before returning control back to the caller."""
+
+    class TestModel(BoringModel):
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('x', loss)
+
+    model = TestModel()
+    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='x', mode='min', save_top_k=1)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=2,
+        accelerator='ddp',
+        gpus=2,
+        callbacks=[checkpoint],
+    )
+    trainer.fit(model)
+    assert os.path.exists(checkpoint.best_model_path), f'Could not find checkpoint at rank {trainer.global_rank}'
+    trainer.test()
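
A minimal sketch of the fit-then-test pattern the `teardown` barrier protects, assuming a 2-GPU machine and the same `accelerator='ddp'` / `gpus=2` style of Trainer arguments used in the test above. `MyModule`, `RandomDataset`, and the script layout are hypothetical stand-ins for illustration, not part of the patch:

```python
# Hypothetical standalone script: every DDP rank runs this file. Without the
# barrier in Accelerator.teardown, non-zero ranks could reach trainer.test()
# before rank 0 has finished writing the best checkpoint to disk.
import os

import torch
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint


class RandomDataset(Dataset):
    def __len__(self):
        return 64

    def __getitem__(self, idx):
        return torch.randn(32)


class MyModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        return self(batch).sum()

    def validation_step(self, batch, batch_idx):
        # logged metric monitored by the ModelCheckpoint below
        self.log("x", self(batch).sum())

    def test_step(self, batch, batch_idx):
        self.log("test_x", self(batch).sum())

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)

    def train_dataloader(self):
        return DataLoader(RandomDataset(), batch_size=8)

    val_dataloader = train_dataloader
    test_dataloader = train_dataloader


if __name__ == "__main__":
    checkpoint = ModelCheckpoint(monitor="x", mode="min", save_top_k=1)
    trainer = pl.Trainer(max_epochs=2, gpus=2, accelerator="ddp", callbacks=[checkpoint])
    trainer.fit(MyModule())
    # The barrier in teardown means the best checkpoint exists on every rank here,
    # so test() can load it without racing rank 0's checkpoint write.
    assert os.path.exists(checkpoint.best_model_path)
    trainer.test()
```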