From c700cabcc985f9780843d5a28f9d5974be588017 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 19:02:04 +0200 Subject: [PATCH 01/31] Add ddp training type teardown --- pytorch_lightning/plugins/training_type/ddp.py | 9 +++++++-- tests/accelerators/test_ddp.py | 4 +--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ed320f37d7006..1ebed7c55405b 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -227,7 +227,7 @@ def setup_distributed(self): self.init_ddp_connection() # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): + if self.is_global_zero and not torch_distrib.is_initialized(): log.info("-" * 100) log.info(f"distributed_backend={self.distributed_backend}") log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") @@ -297,7 +297,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt world_size = world_size if world_size is not None else self.cluster_environment.world_size() os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - if not torch.distributed.is_initialized(): + if not torch_distrib.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size) @@ -373,3 +373,8 @@ def register_plugins(cls, plugin_registry: Dict) -> None: description="DDP Plugin with `find_unused_parameters` as False", find_unused_parameters=False ) + + def teardown(self) -> None: + if torch_distrib.is_initialized(): + torch_distrib.destroy_process_group() + super().teardown() diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 3f335964a5eee..9f6b160567a84 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -109,7 +109,6 @@ class TestModel(BoringModel): def setup(self, stage: Optional[str] = None) -> None: assert torch.distributed.is_initialized() - raise SystemExit() model = TestModel() trainer = Trainer( @@ -118,8 +117,7 @@ def setup(self, stage: Optional[str] = None) -> None: accelerator="ddp", gpus=1, ) - with pytest.raises(SystemExit): - trainer.fit(model) + trainer.fit(model) @RunIf(min_gpus=2, min_torch="1.8.1", special=True) From e5602c92942caf6d54b9e0b3ce7f2fd90783950a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 19:10:42 +0200 Subject: [PATCH 02/31] Update CHANGELOG --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 176413cd55e76..d2cd3926381e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -285,6 +285,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970)) +- Destroy the distributed process group on DDP teardown ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) + + - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941)) From 0b94b6c269cd04c3ec495a0beebd58bcda949b29 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 14:12:08 +0200 Subject: [PATCH 03/31] Use destructor --- pytorch_lightning/plugins/training_type/ddp.py | 3 +-- tests/accelerators/test_ddp.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 1ebed7c55405b..2ea19fb0c781b 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -374,7 +374,6 @@ def register_plugins(cls, plugin_registry: Dict) -> None: find_unused_parameters=False ) - def teardown(self) -> None: + def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - super().teardown() diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 9f6b160567a84..3f335964a5eee 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -109,6 +109,7 @@ class TestModel(BoringModel): def setup(self, stage: Optional[str] = None) -> None: assert torch.distributed.is_initialized() + raise SystemExit() model = TestModel() trainer = Trainer( @@ -117,7 +118,8 @@ def setup(self, stage: Optional[str] = None) -> None: accelerator="ddp", gpus=1, ) - trainer.fit(model) + with pytest.raises(SystemExit): + trainer.fit(model) @RunIf(min_gpus=2, min_torch="1.8.1", special=True) From aaf32abde54cfb1bf205ae8bea878e9ebe282ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 23 Jun 2021 14:14:54 +0200 Subject: [PATCH 04/31] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2cd3926381e3..1c6633fe88508 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -285,7 +285,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970)) -- Destroy the distributed process group on DDP teardown ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) +- Destroy the distributed process group on DDP destructor ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941)) From 0444d541c67e4f236c2de084924bbf4fef36c5e9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 16:28:22 +0200 Subject: [PATCH 05/31] RPC destructor --- pytorch_lightning/plugins/training_type/rpc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index 3e0f57daef001..d8698e71bd261 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -83,3 +83,7 @@ def exit_rpc_process(self): @property def rpc_enabled(self) -> bool: return True + + def __del__(self): + self.exit_rpc_process() + super().__del__() From 5d4f811cd865ee8952b7f388c487af671e919bbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 23 Jun 2021 16:28:38 +0200 Subject: [PATCH 06/31] Update pytorch_lightning/plugins/training_type/ddp.py --- pytorch_lightning/plugins/training_type/ddp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 2ea19fb0c781b..11601dec1fc6c 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -377,3 +377,6 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() From bf8766d392c021a4b20b4641ca9ee83bb042386c Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 16:56:20 +0200 Subject: [PATCH 07/31] Why do you not work :( --- pytorch_lightning/plugins/training_type/rpc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index d8698e71bd261..f825732f7e316 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -85,5 +85,5 @@ def rpc_enabled(self) -> bool: return True def __del__(self): - self.exit_rpc_process() - super().__del__() + # avoid hang + ... 
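The pattern patches 03-07 converge on: cleanup moves from an explicit teardown() hook into __del__, and the RPC destructor is deliberately left empty because a blocking call such as exit_rpc_process() inside a destructor can hang the interpreter during garbage collection. A minimal sketch of the guarded-destructor idea, using only the public torch.distributed API (the class here is illustrative, not the plugin's real hierarchy):

    import torch.distributed

    class _DestructorCleanupSketch:
        """Illustrative only: defensive distributed cleanup in __del__."""

        def __del__(self) -> None:
            # A destructor may run during interpreter shutdown, when modules
            # are partially torn down, so every call here must be guarded and
            # must not block. `is_initialized` avoids destroying a process
            # group that was never created (single-process runs, setup
            # failures).
            if torch.distributed.is_available() and torch.distributed.is_initialized():
                torch.distributed.destroy_process_group()

Anything that can block, such as an RPC shutdown handshake, is safer outside __del__ entirely, which is why patch 07 reduces the RPC destructor to a no-op.
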
From 48bcb7ed2ea7ac463e0dc39e5808416c56957404 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 18:05:16 +0200 Subject: [PATCH 08/31] Missing condition --- pytorch_lightning/plugins/training_type/ddp.py | 7 ++++--- pytorch_lightning/plugins/training_type/rpc.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 11601dec1fc6c..88fe86420069e 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -377,6 +377,7 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + if self.on_gpu: + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index f825732f7e316..f20ece7ebbcf7 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -86,4 +86,4 @@ def rpc_enabled(self) -> bool: def __del__(self): # avoid hang - ... + pass From 21ad2d8234e053e08d97efd74a261fa76d6e8b56 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 04:46:44 +0200 Subject: [PATCH 09/31] Fix deepspeed test --- tests/plugins/test_deepspeed_plugin.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index c5eaadd1e5985..2e96ced4c0c26 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,3 +1,4 @@ +import gc import json import os from typing import Any, Dict @@ -265,6 +266,10 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)]) def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value): """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes.""" + # the previous parametrization can impact the current one as it's not guaranteed that resources will be released + # between parametrizations. This is important as we call `destroy_process_group` in `DDPPlugin.__del__`. + # Another option would be to not use `parametrize`: https://github.com/pytest-dev/pytest/discussions/8153 + gc.collect() class TestModel(BoringModel): From bbc489e313dccc95018d2f750a89f00c656a43d2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 04:57:54 +0200 Subject: [PATCH 10/31] GC collect in conftest --- tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 7f6407ecfd82b..6cbdc3c3783c2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import gc import os import sys import threading @@ -36,6 +37,9 @@ def restore_env_variables(): """ Ensures that environment variables set during the test do not leak out. """ env_backup = os.environ.copy() yield + # if a destructor accesses an environment variable, we need to make sure that `os.environ` is not cleared + # before `__del__` is called. 
Force the call by triggering garbage collection. + gc.collect() # restore environment as it was before running the test os.environ.clear() os.environ.update(env_backup) From 5b06fd2c2528e29929e56fcc765ad708a8a77586 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 05:43:43 +0200 Subject: [PATCH 11/31] Do not show warnings for special tests --- tests/special_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 9fca3b62bad40..a87f50548d06b 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -17,7 +17,7 @@ set -e # this environment variable allows special tests to run export PL_RUNNING_SPECIAL_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no' +defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no --disable-warnings' # find tests marked as `@RunIf(special=True)` grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') From 5e69ed84f9b09bcd25fe70909dc94470287982c6 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 05:44:05 +0200 Subject: [PATCH 12/31] Needs to run on 1.8 To avoid: "RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:32, unhandled cuda error, NCCL version 2.4.8" --- .azure-pipelines/gpu-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5333bfd867da0..bc7120bbc2ae6 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -32,12 +32,9 @@ jobs: # python.version: '3.7' # ToDo: this need to have installed docker in the base image... - #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 - #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6" container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6" - #endpoint: azureContainerRegistryConnection + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" workspace: From aed51a2c09213267283455efefb50f111c33384b Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 15:45:22 +0200 Subject: [PATCH 13/31] Run torch 1.8 --- .azure-pipelines/gpu-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5333bfd867da0..bc7120bbc2ae6 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -32,12 +32,9 @@ jobs: # python.version: '3.7' # ToDo: this need to have installed docker in the base image... 
- #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 - #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6" container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6" - #endpoint: azureContainerRegistryConnection + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" workspace: From e0a3e8785d2fecd63667da433a648f958d60ef89 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 16:01:33 +0200 Subject: [PATCH 14/31] Skip test due to 'Python bus error' --- tests/helpers/test_models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/helpers/test_models.py b/tests/helpers/test_models.py index e4bb7e7df0827..61b33265d1458 100644 --- a/tests/helpers/test_models.py +++ b/tests/helpers/test_models.py @@ -23,11 +23,12 @@ @pytest.mark.parametrize( - "data_class,model_class", [ + "data_class,model_class", + [ (None, BoringModel), (None, BasicGAN), (None, ParityModuleRNN), - (None, ParityModuleMNIST), + # (None, ParityModuleMNIST), (ClassifDataModule, ClassificationModel), (RegressDataModule, RegressionModel), ] From 9ee2d193832d022dd95096e932476dedcbd990d4 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 16:34:26 +0200 Subject: [PATCH 15/31] Debug NCCL --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index bc7120bbc2ae6..f1b57f9233ae3 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -71,7 +71,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + NCCL_DEBUG=INFO python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | From 3588aaa37723db12ee17969a80e4c90028c071ba Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:06:20 +0200 Subject: [PATCH 16/31] shm size --- .azure-pipelines/gpu-tests.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index f1b57f9233ae3..421ad96688d5a 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -25,17 +25,11 @@ jobs: pool: gridai-spot-pool - #strategy: - # matrix: - # PT16: - # torch.version: '1.6' - # python.version: '3.7' - # ToDo: this need to have installed docker in the base image... 
container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" workspace: clean: all From 067bf1ae9eee271aaf3c4e4ac6bf9a50ba807fa2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:28:56 +0200 Subject: [PATCH 17/31] Disable warnings for special tests --- tests/special_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 9fca3b62bad40..a87f50548d06b 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -17,7 +17,7 @@ set -e # this environment variable allows special tests to run export PL_RUNNING_SPECIAL_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no' +defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no --disable-warnings' # find tests marked as `@RunIf(special=True)` grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') From 6060b05215f0b824944bcabb2d7a4f3440625a96 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:29:25 +0200 Subject: [PATCH 18/31] Remove NCCL_DEBUG statement --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 421ad96688d5a..5499202bc690e 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -65,7 +65,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - NCCL_DEBUG=INFO python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | From f0fa1b74d0790a397702305a8cdd93ad7bcf18b7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:30:06 +0200 Subject: [PATCH 19/31] Try smaller shm size --- .azure-pipelines/gpu-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5499202bc690e..b1fedd578bc85 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -28,8 +28,11 @@ jobs: # ToDo: this need to have installed docker in the base image... container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 + # run on torch 1.8 as it's the LTS version image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" + # default shm size is 64m. Increase it to avoid: + # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=256m" workspace: clean: all From 6dd70381ce88f8ac3459de4b9795a875d596c9f5 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:31:05 +0200 Subject: [PATCH 20/31] Revert "Skip test due to 'Python bus error'" This reverts commit e0a3e8785d2fecd63667da433a648f958d60ef89. 
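The underlying "Python bus error" most likely came from DataLoader worker processes exhausting the container's /dev/shm (Docker defaults to 64m, as noted in the comment added in the previous patch alongside the --shm-size option); with the shared-memory limit raised, the ParityModuleMNIST test can be restored rather than skipped. A small illustration of that failure mode, with made-up tensor sizes (run with num_workers > 0 inside a container left at the default 64m shm to see workers die with SIGBUS):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Worker processes hand batches back to the parent through shared memory
    # (/dev/shm in a container), so a tight shm limit kills them with SIGBUS,
    # which Python reports as a bus error.
    dataset = TensorDataset(torch.randn(2048, 3, 64, 64))
    loader = DataLoader(dataset, batch_size=512, num_workers=2)
    for (batch,) in loader:
        pass
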
--- tests/helpers/test_models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/helpers/test_models.py b/tests/helpers/test_models.py index 61b33265d1458..e4bb7e7df0827 100644 --- a/tests/helpers/test_models.py +++ b/tests/helpers/test_models.py @@ -23,12 +23,11 @@ @pytest.mark.parametrize( - "data_class,model_class", - [ + "data_class,model_class", [ (None, BoringModel), (None, BasicGAN), (None, ParityModuleRNN), - # (None, ParityModuleMNIST), + (None, ParityModuleMNIST), (ClassifDataModule, ClassificationModel), (RegressDataModule, RegressionModel), ] From 73e62f8aba385a3cad540c438fb500a46ded9648 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:15:47 +0200 Subject: [PATCH 21/31] README and adjust versions --- README.md | 4 ++-- requirements/adjust_versions.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7a540adadd327..78175f95c28fd 100644 --- a/README.md +++ b/README.md @@ -74,10 +74,10 @@ Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major
- | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (latest) | 1.9 (nightly) | + | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (LTS) | 1.9 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | Linux py3.7 [GPUs**] | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | - | + | Linux py3.7 [GPUs**] | - | - | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | | Linux py3.{6,7} [TPUs***] | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.{6,7,8,9} | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | | OSX py3.{6,7,8,9} | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index a09128c6200db..84879b4e48a34 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -4,7 +4,8 @@ from typing import Dict, Optional VERSIONS = [ - dict(torch="1.9.0", torchvision="", torchtext=""), # nightly + dict(torch="1.10.0", torchvision="", torchtext=""), # nightly + dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"), dict(torch="1.8.1", torchvision="0.9.1", torchtext="0.9.1"), dict(torch="1.8.0", torchvision="0.9.0", torchtext="0.9.0"), dict(torch="1.7.1", torchvision="0.8.2", torchtext="0.8.1"), From 902ef02b95fee49275b60a04ac8dbe9d6f682933 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:22:21 +0200 Subject: [PATCH 22/31] Avoid self.on_gpu call --- pytorch_lightning/plugins/training_type/ddp.py | 6 ++---- tests/conftest.py | 4 ---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index dcc78f7bc5d40..c04a4ab111a20 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -384,7 +384,5 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - if self.on_gpu: - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + # `is_initialized` is checked inside and we already set the default device with `set_device(self.root_device)` + torch.cuda.empty_cache() diff --git a/tests/conftest.py b/tests/conftest.py index 6cbdc3c3783c2..7f6407ecfd82b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import gc import os import sys import threading @@ -37,9 +36,6 @@ def restore_env_variables(): """ Ensures that environment variables set during the test do not leak out. """ env_backup = os.environ.copy() yield - # if a destructor accesses an environment variable, we need to make sure that `os.environ` is not cleared - # before `__del__` is called. Force the call by triggering garbage collection. 
- gc.collect() # restore environment as it was before running the test os.environ.clear() os.environ.update(env_backup) From 4ce0f9a1feaa8e85e536f5dead658d17c65611c8 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:41:53 +0200 Subject: [PATCH 23/31] empty cache cleanup --- pytorch_lightning/accelerators/gpu.py | 5 +---- .../plugins/training_type/parallel.py | 3 +-- .../plugins/training_type/single_device.py | 3 +-- .../trainer/connectors/checkpoint_connector.py | 14 +++----------- pytorch_lightning/utilities/memory.py | 15 +++++++-------- 5 files changed, 13 insertions(+), 27 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 7543a2b794b5d..1c5ff56d805a6 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -42,10 +42,7 @@ def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None: def on_train_start(self) -> None: # clear cache before training - # use context because of: - # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() @staticmethod def set_nvidia_flags(local_rank: int) -> None: diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 09e48a760e868..122a1423c2817 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -132,5 +132,4 @@ def teardown(self) -> None: # GPU teardown self.lightning_module.cpu() # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 1816f5838c948..d4a328902eba0 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -85,5 +85,4 @@ def teardown(self) -> None: # GPU teardown self.lightning_module.cpu() # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index c2a0411c0df36..0bc3145a99e59 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -21,13 +21,7 @@ import pytorch_lightning from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import ( - _OMEGACONF_AVAILABLE, - DeviceType, - rank_zero_deprecation, - rank_zero_info, - rank_zero_warn, -) +from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS @@ -69,8 +63,7 @@ def resume_start(self) -> None: return # clear cache before restore - if self.trainer._device_type == DeviceType.GPU: - torch.cuda.empty_cache() + torch.cuda.empty_cache() # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint. 
fs = get_filesystem(checkpoint_path) @@ -88,8 +81,7 @@ def resume_end(self) -> None: self._loaded_checkpoint = dict() # clear cache after restore - if self.trainer._device_type == DeviceType.GPU: - torch.cuda.empty_cache() + torch.cuda.empty_cache() # wait for all to catch up self.trainer.training_type_plugin.barrier("CheckpointConnector.resume_end") diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 6c01390a8c81e..0ae88e8995614 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -76,11 +76,10 @@ def is_out_of_cpu_memory(exception): def garbage_collection_cuda(): """Garbage collection Torch (CUDA) memory.""" gc.collect() - if torch.cuda.is_available(): - try: - # This is the last thing that should cause an OOM error, but seemingly it can. - torch.cuda.empty_cache() - except RuntimeError as exception: - if not is_oom_error(exception): - # Only handle OOM errors - raise + try: + # This is the last thing that should cause an OOM error, but seemingly it can. + torch.cuda.empty_cache() + except RuntimeError as exception: + if not is_oom_error(exception): + # Only handle OOM errors + raise From 738daa5bbccd5e00f65ea5f4c8b9218fea15839d Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 19:06:25 +0200 Subject: [PATCH 24/31] More garbage collection --- tests/plugins/test_deepspeed_plugin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 2e96ced4c0c26..b609bc78d74fc 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -650,6 +650,8 @@ def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_opt """ Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. 
""" + gc.collect() + seed_everything(42) class VerificationCallback(Callback): From 236aa97bf35af324a43ca4f729f7ebdecff5fa3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Jun 2021 21:23:50 +0200 Subject: [PATCH 25/31] Unroll parametrizations --- tests/callbacks/test_pruning.py | 41 ++++++++++++++++--- .../test_checkpoint_callback_frequency.py | 14 +++++-- tests/plugins/test_deepspeed_plugin.py | 16 +++++--- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index f198b29d24e84..1a5ddad64106e 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -162,13 +162,44 @@ def test_pruning_callback( @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize("parameters_to_prune", [False, True]) -@pytest.mark.parametrize("use_global_unstructured", [False, True]) -def test_pruning_callback_ddp(tmpdir, use_global_unstructured: bool, parameters_to_prune: bool): +def test_pruning_callback_ddp_0(tmpdir): train_with_pruning_callback( tmpdir, - parameters_to_prune=parameters_to_prune, - use_global_unstructured=use_global_unstructured, + parameters_to_prune=False, + use_global_unstructured=False, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_1(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=False, + use_global_unstructured=True, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_2(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=True, + use_global_unstructured=False, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_3(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=True, + use_global_unstructured=True, accelerator="ddp", gpus=2, ) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 9fdd69dba7a9a..c5afecc2b4bf3 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -105,10 +105,18 @@ def training_step(self, batch, batch_idx): assert save_mock.call_count == expected -@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)]) -def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): +def test_top_k_ddp_0(tmpdir): + _top_k_ddp(tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) + + +@RunIf(special=True, min_gpus=2) +def test_top_k_ddp_1(tmpdir): + _top_k_ddp(tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) + + +@mock.patch('torch.save') +def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index b609bc78d74fc..b443827cac70c 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -644,14 +644,10 @@ def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir): run_checkpoint_test(tmpdir, save_full_weights=True, automatic_optimization=False, accumulate_grad_batches=1) -@RunIf(min_gpus=2, deepspeed=True, special=True) -@pytest.mark.parametrize('offload_optimizer', [True, False]) -def 
test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): +def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): """ Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. """ - gc.collect() - seed_everything(42) class VerificationCallback(Callback): @@ -678,6 +674,16 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm) +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): + _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) + + +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): + _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) + + @RunIf(min_gpus=2, deepspeed=True, special=True) def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): """ From ffa532d3bb888606438577f98e7ae512fa28a0cd Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 25 Jun 2021 00:30:21 +0200 Subject: [PATCH 26/31] Do not reuse mock --- .../test_checkpoint_callback_frequency.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index c5afecc2b4bf3..67db594aa2539 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -105,17 +105,18 @@ def training_step(self, batch, batch_idx): assert save_mock.call_count == expected +@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -def test_top_k_ddp_0(tmpdir): - _top_k_ddp(tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) +def test_top_k_ddp_0(save_mock, tmpdir): + _top_k_ddp(save_mock, tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) +@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -def test_top_k_ddp_1(tmpdir): - _top_k_ddp(tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) +def test_top_k_ddp_1(save_mock, tmpdir): + _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) -@mock.patch('torch.save') def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): From 7a123543ab23c70920f9528a413479690a5d898e Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 25 Jun 2021 02:16:28 +0200 Subject: [PATCH 27/31] Remove abbreviation --- pytorch_lightning/plugins/training_type/ddp.py | 18 ++++++++++-------- .../plugins/training_type/ddp_spawn.py | 10 ++++++---- .../plugins/training_type/horovod.py | 4 ++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index c04a4ab111a20..0335ca357b055 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -21,7 +21,7 @@ import __main__ import numpy as np import torch -import torch.distributed as torch_distrib +import torch.distributed from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import Optimizer @@ -234,7 +234,7 @@ def setup_distributed(self): self.init_ddp_connection() # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch_distrib.is_initialized(): + if self.is_global_zero and not torch.distributed.is_initialized(): log.info("-" * 100) 
log.info(f"distributed_backend={self.distributed_backend}") log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") @@ -304,9 +304,11 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt world_size = world_size if world_size is not None else self.cluster_environment.world_size() os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - if not torch_distrib.is_initialized(): + if not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size) + torch.distributed.init_process_group( + self.torch_distributed_backend, rank=global_rank, world_size=world_size + ) def pre_dispatch(self): # move the model to the correct device @@ -323,8 +325,8 @@ def post_dispatch(self) -> None: self.cluster_environment.teardown() def barrier(self, *args, **kwargs): - if torch_distrib.is_available() and torch_distrib.is_initialized(): - torch_distrib.barrier() + if torch.distributed.is_initialized(): + torch.distributed.barrier() def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) @@ -382,7 +384,7 @@ def register_plugins(cls, plugin_registry: Dict) -> None: ) def __del__(self) -> None: - if torch_distrib.is_initialized(): - torch_distrib.destroy_process_group() + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() # `is_initialized` is checked inside and we already set the default device with `set_device(self.root_device)` torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 47f2a64c04759..d3a5acce112c1 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -17,7 +17,7 @@ from typing import Any, List, Optional, Union import torch -import torch.distributed as torch_distrib +import torch.distributed import torch.multiprocessing as mp from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import Optimizer @@ -265,7 +265,9 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i if not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size) + torch.distributed.init_process_group( + self.torch_distributed_backend, rank=global_rank, world_size=world_size + ) def determine_ddp_device_ids(self): if self.root_device.type == "cpu": @@ -306,8 +308,8 @@ def __recover_child_process_weights(self, best_path, last_path): self.lightning_module.load_state_dict(ckpt) def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): - torch_distrib.barrier() + if torch.distributed.is_initialized(): + torch.distributed.barrier() def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 99899aed11753..cbd9e80dabc23 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -15,7 +15,7 @@ from typing import Any, 
List, Optional, Union import torch -import torch.distributed as torch_distrib +import torch.distributed from torch.optim.lr_scheduler import _LRScheduler, Optimizer from pytorch_lightning.core.optimizer import LightningOptimizer @@ -125,7 +125,7 @@ def start_predicting(self, trainer): self.join() def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): + if torch.distributed.is_initialized(): self.join() def broadcast(self, obj: object, src: int = 0) -> object: From 74d8a7d5cc0f0043c4551f393dc2b4a17ce53625 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 29 Jun 2021 17:45:34 +0200 Subject: [PATCH 28/31] Has initialized ddp --- pytorch_lightning/plugins/training_type/ddp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 89ae36227d884..8957967ec31a2 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -100,6 +100,7 @@ def __init__( self._ddp_comm_wrapper = ddp_comm_wrapper self._pids: Optional[List[int]] = None self._sync_dir: Optional[str] = None + self._has_initialized_ddp: bool = False self.set_world_ranks() @property @@ -310,7 +311,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt torch.distributed.init_process_group( self.torch_distributed_backend, rank=global_rank, world_size=world_size ) - + self._has_initialized_ddp = True # on rank=0 let everyone know training is starting rank_zero_info( f"{'-' * 100}\n" @@ -335,12 +336,12 @@ def post_dispatch(self) -> None: self.cluster_environment.teardown() def barrier(self, *args, **kwargs) -> None: - if not torch_distrib.is_initialized(): + if not torch.distributed.is_initialized(): return if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl": - torch_distrib.barrier(device_ids=self.determine_ddp_device_ids()) + torch.distributed.barrier(device_ids=self.determine_ddp_device_ids()) else: - torch_distrib.barrier() + torch.distributed.barrier() def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) @@ -436,7 +437,6 @@ def reconciliate_processes(self, trace: str): raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") def __del__(self) -> None: - if torch.distributed.is_initialized(): + if torch.distributed.is_initialized() and self._has_initialized_ddp: torch.distributed.destroy_process_group() - # `is_initialized` is checked inside and we already set the default device with `set_device(self.root_device)` torch.cuda.empty_cache() From 3c2ac1063195b77e9e1dc91b9660022bbcd056f8 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 2 Jul 2021 16:08:11 +0200 Subject: [PATCH 29/31] Merge master --- pytorch_lightning/plugins/training_type/ddp.py | 2 +- tests/conftest.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index bc6fd76429185..f66c1ae00c15f 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -453,6 +453,6 @@ def reconciliate_processes(self, trace: str): raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") def __del__(self) -> None: - if torch.distributed.is_initialized() and self._has_initialized_ddp: + if distributed_available() and self._has_initialized_ddp: 
torch.distributed.destroy_process_group() torch.cuda.empty_cache() diff --git a/tests/conftest.py b/tests/conftest.py index 3f767d8b6fad2..7f6407ecfd82b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,7 +18,6 @@ from http.server import SimpleHTTPRequestHandler import pytest -import torch.distributed import torch.multiprocessing as mp @@ -42,14 +41,6 @@ def restore_env_variables(): os.environ.update(env_backup) -@pytest.fixture(scope="function", autouse=True) -def teardown_process_group(): - """ Ensures that the distributed process group gets closed before the next test runs. """ - yield - if torch.distributed.is_available() and torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - def pytest_configure(config): config.addinivalue_line("markers", "spawn: spawn test in a separate process using torch.multiprocessing.spawn") From 9ee6ee2d260a670ae2941231c6ba90c44a47b495 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 2 Jul 2021 16:09:02 +0200 Subject: [PATCH 30/31] Merge master --- tests/plugins/test_deepspeed_plugin.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 8f24c17bcb6a1..6c238ab747350 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -684,16 +684,6 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm) -@RunIf(min_gpus=2, deepspeed=True, special=True) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): - _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) - - -@RunIf(min_gpus=2, deepspeed=True, special=True) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): - _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) - - @RunIf(min_gpus=2, deepspeed=True, special=True) def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): """ From fc6338ea698ae33aa542835b79be10ef2d823939 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 2 Jul 2021 16:10:05 +0200 Subject: [PATCH 31/31] Unnecessary annotation --- pytorch_lightning/plugins/training_type/ddp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index f66c1ae00c15f..3afd5c7d4097a 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -106,7 +106,7 @@ def __init__( self._ddp_comm_wrapper = ddp_comm_wrapper self._pids: Optional[List[int]] = None self._sync_dir: Optional[str] = None - self._has_initialized_ddp: bool = False + self._has_initialized_ddp = False self.set_world_ranks() @property
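
Taken together, the series ends with a clear ownership rule: the DDP plugin destroys the process group in __del__ only if it was the one that created it, tracked by _has_initialized_ddp, and torch.cuda.empty_cache() needs no device guard or GPU check because it is a no-op when CUDA was never initialized. A condensed sketch of that final shape (names simplified; distributed_available stands in for Lightning's helper of the same name, and MASTER_ADDR/MASTER_PORT must already be set, as the plugin arranges via its cluster environment):

    import torch
    import torch.distributed

    def distributed_available() -> bool:
        # Stand-in for Lightning's `distributed_available` helper.
        return torch.distributed.is_available() and torch.distributed.is_initialized()

    class DDPPluginSketch:

        def __init__(self) -> None:
            # Set only by the code path that actually called
            # init_process_group, so the destructor never tears down a
            # process group created elsewhere (another plugin, or the
            # user's own script).
            self._has_initialized_ddp = False

        def init_ddp_connection(self, backend: str, global_rank: int, world_size: int) -> None:
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend, rank=global_rank, world_size=world_size)
                self._has_initialized_ddp = True

        def __del__(self) -> None:
            if distributed_available() and self._has_initialized_ddp:
                torch.distributed.destroy_process_group()
            # No `with torch.cuda.device(...)` wrapper needed: empty_cache
            # checks CUDA initialization internally and no-ops otherwise.
            torch.cuda.empty_cache()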