From 7e9e49c29c1d89f3bd08fc8d527dcd2189fbbc5d Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 10:43:43 -0700
Subject: [PATCH] add-test

---
 CHANGELOG.md                                  |  3 +++
 pytorch_lightning/accelerators/accelerator.py | 10 ++++++-
 tests/trainer/test_trainer.py                 | 27 +++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9cab7a63eca6e..24e74e0a177b11 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -177,6 +177,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Add a barrier in the accelerator `teardown` to synchronize processes before execution finishes ([#6814](https://github.com/PyTorchLightning/pytorch-lightning/pull/6814))
+
+
 - Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))
 
 
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 408a89d752ac11..c2b5631df76ad1 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -138,7 +138,15 @@ def root_device(self) -> torch.device:
     def teardown(self) -> None:
         """This method is called to teardown the training process.
 
-        It is the right place to release memory and free other ressources.
+        It is the right place to release memory and free other resources.
+
+        By default, we add a barrier here to synchronize processes at the end of execution
+        before returning control back to the calling code. This prevents race conditions in patterns like this:
+
+        ```
+        trainer.fit(module)
+        trainer.test() <-- best checkpoint path needs to be available on all ranks
+        ```
         """
         self.barrier("teardown")
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index ed255d690ed834..dc4309624457d1 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1438,7 +1438,9 @@ def setup(self, model, stage):
 )
 @patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics")
 def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, log_interval):
+
     class TestModel(BoringModel):
+
         def training_step(self, *args, **kwargs):
             self.log("foo", -1)
             return super().training_step(*args, **kwargs)
@@ -1888,3 +1890,28 @@ def test_exception_when_testing_or_validating_with_fast_dev_run(tmpdir):
         trainer.validate()
     with pytest.raises(MisconfigurationException, match=r"\.test\(\)` with `fast_dev_run=True"):
         trainer.test()
+
+
+@RunIf(skip_windows=True, min_gpus=2, special=True)
+def test_fit_test_synchronization(tmpdir):
+    """Test that the trainer synchronizes processes before returning control back to the caller."""
+
+    class TestModel(BoringModel):
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('x', loss)
+
+    model = TestModel()
+    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='x', mode='min', save_top_k=1)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=2,
+        accelerator='ddp',
+        gpus=2,
+        callbacks=[checkpoint],
+    )
+    trainer.fit(model)
+    assert os.path.exists(checkpoint.best_model_path), f'Could not find checkpoint at rank {trainer.global_rank}'
+    trainer.test()
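
A minimal sketch of the fit-then-test pattern the `teardown` barrier protects, assuming a 2-GPU machine and the same `accelerator='ddp'` / `gpus=2` style of Trainer arguments used in the test above. `MyModule`, `RandomDataset`, and the script layout are hypothetical stand-ins for illustration, not part of the patch:

```python
# Hypothetical standalone script: every DDP rank runs this file. Without the
# barrier in Accelerator.teardown, non-zero ranks could reach trainer.test()
# before rank 0 has finished writing the best checkpoint to disk.
import os

import torch
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint


class RandomDataset(Dataset):
    def __len__(self):
        return 64

    def __getitem__(self, idx):
        return torch.randn(32)


class MyModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        return self(batch).sum()

    def validation_step(self, batch, batch_idx):
        # logged metric monitored by the ModelCheckpoint below
        self.log("x", self(batch).sum())

    def test_step(self, batch, batch_idx):
        self.log("test_x", self(batch).sum())

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)

    def train_dataloader(self):
        return DataLoader(RandomDataset(), batch_size=8)

    val_dataloader = train_dataloader
    test_dataloader = train_dataloader


if __name__ == "__main__":
    checkpoint = ModelCheckpoint(monitor="x", mode="min", save_top_k=1)
    trainer = pl.Trainer(max_epochs=2, gpus=2, accelerator="ddp", callbacks=[checkpoint])
    trainer.fit(MyModule())
    # The barrier in teardown means the best checkpoint exists on every rank here,
    # so test() can load it without racing rank 0's checkpoint write.
    assert os.path.exists(checkpoint.best_model_path)
    trainer.test()
```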