From c8563cdff380aee0fcb5e37add43ec82c0cce910 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Sat, 3 Apr 2021 21:08:23 -0700
Subject: [PATCH 1/5] Update accelerator.py

---
 pytorch_lightning/accelerators/accelerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index e97450fdbd8852..57461952aa17d8 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -140,7 +140,7 @@ def teardown(self) -> None:
         """This method is called to teardown the training process.
         It is the right place to release memory and free other ressources.
         """
-        pass
+        self.barrier()
 
     def batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any:
         """Moves the batch to the correct device.

From c9377a19e87758c055bd7a4957c6644f02cafea2 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Sun, 4 Apr 2021 08:47:59 -0700
Subject: [PATCH 2/5] Update pytorch_lightning/accelerators/accelerator.py

Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com>
---
 pytorch_lightning/accelerators/accelerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 57461952aa17d8..408a89d752ac11 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -140,7 +140,7 @@ def teardown(self) -> None:
         """This method is called to teardown the training process.
         It is the right place to release memory and free other ressources.
         """
-        self.barrier()
+        self.barrier("teardown")
 
     def batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any:
         """Moves the batch to the correct device.

From 7e9e49c29c1d89f3bd08fc8d527dcd2189fbbc5d Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 10:43:43 -0700
Subject: [PATCH 3/5] add-test

---
 CHANGELOG.md                                  |  3 +++
 pytorch_lightning/accelerators/accelerator.py | 10 ++++++-
 tests/trainer/test_trainer.py                 | 27 +++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9cab7a63eca6e..24e74e0a177b11 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -177,6 +177,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Add a barrier in the accelerator `teardown` to synchronize processes before execution finishes ([#6814](https://github.com/PyTorchLightning/pytorch-lightning/pull/6814))
+
+
 - Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))
 
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 408a89d752ac11..c2b5631df76ad1 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -138,7 +138,15 @@ def root_device(self) -> torch.device:
 
     def teardown(self) -> None:
         """This method is called to teardown the training process.
-        It is the right place to release memory and free other ressources.
+        It is the right place to release memory and free other resources.
+
+        By default, we add a barrier here to synchronize processes at the end of execution
+        before returning control back to the calling code. This prevents race conditions in patterns like this:
+
+        ```
+        trainer.fit(module)
+        trainer.test() <-- best checkpoint path needs to be available on all ranks
+        ```
         """
         self.barrier("teardown")
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index ed255d690ed834..dc4309624457d1 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1438,7 +1438,9 @@ def setup(self, model, stage):
 )
 @patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics")
 def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, log_interval):
+
     class TestModel(BoringModel):
+
         def training_step(self, *args, **kwargs):
             self.log("foo", -1)
             return super().training_step(*args, **kwargs)
@@ -1888,3 +1890,28 @@ def test_exception_when_testing_or_validating_with_fast_dev_run(tmpdir):
         trainer.validate()
     with pytest.raises(MisconfigurationException, match=r"\.test\(\)` with `fast_dev_run=True"):
         trainer.test()
+
+
+@RunIf(skip_windows=True, min_gpus=2, special=True)
+def test_fit_test_synchronization(tmpdir):
+    """Test that the trainer synchronizes processes before returning control back to the caller. """
+
+    class TestModel(BoringModel):
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('x', loss)
+
+    model = TestModel()
+    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='x', mode='min', save_top_k=1)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=2,
+        accelerator='ddp',
+        gpus=2,
+        callbacks=[checkpoint],
+    )
+    trainer.fit(model)
+    assert os.path.exists(checkpoint.best_model_path), f'Could not find checkpoint at rank {trainer.global_rank}'
+    trainer.test()

From 27db04863aab4399943fdaba5e789b88e798250b Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 13:04:54 -0700
Subject: [PATCH 4/5] Update test_trainer.py

---
 tests/trainer/test_trainer.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index dc4309624457d1..bd8b4f1532ebdf 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1902,9 +1902,14 @@ def validation_step(self, batch, batch_idx):
             output = self.layer(batch)
             loss = self.loss(batch, output)
             self.log('x', loss)
+            return {"x": loss}
+
+        def validation_epoch_end(self, outputs) -> None:
+            avg_x = torch.stack([x['x'] for x in outputs]).mean()
+            self.log('avg_x', avg_x)
 
     model = TestModel()
-    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='x', mode='min', save_top_k=1)
+    checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='avg_x', mode='min', save_top_k=1)
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=2,

From 5d8319364e90a40703343a6a2cd60a9dd15e6331 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 14:07:57 -0700
Subject: [PATCH 5/5] Update test_trainer.py

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index bd8b4f1532ebdf..7f1bc8d5263619 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1906,7 +1906,7 @@ def validation_step(self, batch, batch_idx):
 
         def validation_epoch_end(self, outputs) -> None:
             avg_x = torch.stack([x['x'] for x in outputs]).mean()
-            self.log('avg_x', avg_x)
+            self.log('avg_x', avg_x, sync_dist=True)
 
     model = TestModel()
     checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor='avg_x', mode='min', save_top_k=1)
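
For context, a minimal sketch (not part of the patch series) of the fit-then-test flow that the new `teardown` barrier synchronizes. The `BoringModel` import path is an assumption based on the test helpers used above; the script is launched once per GPU under DDP:

```python
# Sketch of the race the barrier prevents: under accelerator="ddp", each
# process runs this script. Before this patch, a non-zero rank could return
# from fit() and reach test() while rank 0 was still writing the best
# checkpoint, so resolving best_model_path could fail on that rank.
import os

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from tests.helpers import BoringModel  # assumed import path for the test helper

if __name__ == "__main__":
    model = BoringModel()
    checkpoint = ModelCheckpoint(dirpath="checkpoints")  # saves the latest epoch checkpoint
    trainer = Trainer(max_epochs=2, accelerator="ddp", gpus=2, callbacks=[checkpoint])
    trainer.fit(model)
    # With the barrier in Accelerator.teardown, every rank sees the checkpoint
    # on disk here before test() resolves and loads best_model_path.
    assert os.path.exists(checkpoint.best_model_path)
    trainer.test()
```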