From c700cabcc985f9780843d5a28f9d5974be588017 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 19:02:04 +0200 Subject: [PATCH 01/31] Add ddp training type teardown --- pytorch_lightning/plugins/training_type/ddp.py | 9 +++++++-- tests/accelerators/test_ddp.py | 4 +--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ed320f37d7006..1ebed7c55405b 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -227,7 +227,7 @@ def setup_distributed(self): self.init_ddp_connection() # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): + if self.is_global_zero and not torch_distrib.is_initialized(): log.info("-" * 100) log.info(f"distributed_backend={self.distributed_backend}") log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") @@ -297,7 +297,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt world_size = world_size if world_size is not None else self.cluster_environment.world_size() os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - if not torch.distributed.is_initialized(): + if not torch_distrib.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size) @@ -373,3 +373,8 @@ def register_plugins(cls, plugin_registry: Dict) -> None: description="DDP Plugin with `find_unused_parameters` as False", find_unused_parameters=False ) + + def teardown(self) -> None: + if torch_distrib.is_initialized(): + torch_distrib.destroy_process_group() + super().teardown() diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 3f335964a5eee..9f6b160567a84 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -109,7 +109,6 @@ class TestModel(BoringModel): def setup(self, stage: Optional[str] = None) -> None: assert torch.distributed.is_initialized() - raise SystemExit() model = TestModel() trainer = Trainer( @@ -118,8 +117,7 @@ def setup(self, stage: Optional[str] = None) -> None: accelerator="ddp", gpus=1, ) - with pytest.raises(SystemExit): - trainer.fit(model) + trainer.fit(model) @RunIf(min_gpus=2, min_torch="1.8.1", special=True) From e5602c92942caf6d54b9e0b3ce7f2fd90783950a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 19:10:42 +0200 Subject: [PATCH 02/31] Update CHANGELOG --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 176413cd55e76..d2cd3926381e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -285,6 +285,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970)) +- Destroy the distributed process group on DDP teardown ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) + + - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941)) From 0b94b6c269cd04c3ec495a0beebd58bcda949b29 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 14:12:08 +0200 Subject: [PATCH 03/31] Use destructor --- pytorch_lightning/plugins/training_type/ddp.py | 3 +-- tests/accelerators/test_ddp.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 1ebed7c55405b..2ea19fb0c781b 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -374,7 +374,6 @@ def register_plugins(cls, plugin_registry: Dict) -> None: find_unused_parameters=False ) - def teardown(self) -> None: + def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - super().teardown() diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 9f6b160567a84..3f335964a5eee 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -109,6 +109,7 @@ class TestModel(BoringModel): def setup(self, stage: Optional[str] = None) -> None: assert torch.distributed.is_initialized() + raise SystemExit() model = TestModel() trainer = Trainer( @@ -117,7 +118,8 @@ def setup(self, stage: Optional[str] = None) -> None: accelerator="ddp", gpus=1, ) - trainer.fit(model) + with pytest.raises(SystemExit): + trainer.fit(model) @RunIf(min_gpus=2, min_torch="1.8.1", special=True) From aaf32abde54cfb1bf205ae8bea878e9ebe282ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 23 Jun 2021 14:14:54 +0200 Subject: [PATCH 04/31] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2cd3926381e3..1c6633fe88508 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -285,7 +285,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970)) -- Destroy the distributed process group on DDP teardown ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) +- Destroy the distributed process group on DDP destructor ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941)) From 0444d541c67e4f236c2de084924bbf4fef36c5e9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 16:28:22 +0200 Subject: [PATCH 05/31] RPC destructor --- pytorch_lightning/plugins/training_type/rpc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index 3e0f57daef001..d8698e71bd261 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -83,3 +83,7 @@ def exit_rpc_process(self): @property def rpc_enabled(self) -> bool: return True + + def __del__(self): + self.exit_rpc_process() + super().__del__() From 5d4f811cd865ee8952b7f388c487af671e919bbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 23 Jun 2021 16:28:38 +0200 Subject: [PATCH 06/31] Update pytorch_lightning/plugins/training_type/ddp.py --- pytorch_lightning/plugins/training_type/ddp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 2ea19fb0c781b..11601dec1fc6c 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -377,3 +377,6 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() From bf8766d392c021a4b20b4641ca9ee83bb042386c Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 16:56:20 +0200 Subject: [PATCH 07/31] Why do you not work :( --- pytorch_lightning/plugins/training_type/rpc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index d8698e71bd261..f825732f7e316 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -85,5 +85,5 @@ def rpc_enabled(self) -> bool: return True def __del__(self): - self.exit_rpc_process() - super().__del__() + # avoid hang + ... 
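The pattern patches 03-07 converge on: cleanup moves from an explicit teardown() hook into __del__, and the RPC destructor is deliberately left empty because a blocking call such as exit_rpc_process() inside a destructor can hang the interpreter during garbage collection. A minimal sketch of the guarded-destructor idea, using only the public torch.distributed API (the class here is illustrative, not the plugin's real hierarchy):

    import torch.distributed

    class _DestructorCleanupSketch:
        """Illustrative only: defensive distributed cleanup in __del__."""

        def __del__(self) -> None:
            # A destructor may run during interpreter shutdown, when modules
            # are partially torn down, so every call here must be guarded and
            # must not block. `is_initialized` avoids destroying a process
            # group that was never created (single-process runs, setup
            # failures).
            if torch.distributed.is_available() and torch.distributed.is_initialized():
                torch.distributed.destroy_process_group()

Anything that can block, such as an RPC shutdown handshake, is safer outside __del__ entirely, which is why patch 07 reduces the RPC destructor to a no-op.
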
From 48bcb7ed2ea7ac463e0dc39e5808416c56957404 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 18:05:16 +0200 Subject: [PATCH 08/31] Missing condition --- pytorch_lightning/plugins/training_type/ddp.py | 7 ++++--- pytorch_lightning/plugins/training_type/rpc.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 11601dec1fc6c..88fe86420069e 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -377,6 +377,7 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + if self.on_gpu: + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index f825732f7e316..f20ece7ebbcf7 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -86,4 +86,4 @@ def rpc_enabled(self) -> bool: def __del__(self): # avoid hang - ... + pass From 21ad2d8234e053e08d97efd74a261fa76d6e8b56 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 04:46:44 +0200 Subject: [PATCH 09/31] Fix deepspeed test --- tests/plugins/test_deepspeed_plugin.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index c5eaadd1e5985..2e96ced4c0c26 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,3 +1,4 @@ +import gc import json import os from typing import Any, Dict @@ -265,6 +266,10 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)]) def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value): """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes.""" + # the previous parametrization can impact the current one as it's not guaranteed that resources will be released + # between parametrizations. This is important as we call `destroy_process_group` in `DDPPlugin.__del__`. + # Another option would be to not use `parametrize`: https://github.com/pytest-dev/pytest/discussions/8153 + gc.collect() class TestModel(BoringModel): From bbc489e313dccc95018d2f750a89f00c656a43d2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 04:57:54 +0200 Subject: [PATCH 10/31] GC collect in conftest --- tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 7f6407ecfd82b..6cbdc3c3783c2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import gc import os import sys import threading @@ -36,6 +37,9 @@ def restore_env_variables(): """ Ensures that environment variables set during the test do not leak out. """ env_backup = os.environ.copy() yield + # if a destructor accesses an environment variable, we need to make sure that `os.environ` is not cleared + # before `__del__` is called. 
Force the call by triggering garbage collection. + gc.collect() # restore environment as it was before running the test os.environ.clear() os.environ.update(env_backup) From 5b06fd2c2528e29929e56fcc765ad708a8a77586 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 05:43:43 +0200 Subject: [PATCH 11/31] Do not show warnings for special tests --- tests/special_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 9fca3b62bad40..a87f50548d06b 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -17,7 +17,7 @@ set -e # this environment variable allows special tests to run export PL_RUNNING_SPECIAL_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no' +defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no --disable-warnings' # find tests marked as `@RunIf(special=True)` grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') From 5e69ed84f9b09bcd25fe70909dc94470287982c6 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 05:44:05 +0200 Subject: [PATCH 12/31] Needs to run on 1.8 To avoid: "RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:32, unhandled cuda error, NCCL version 2.4.8" --- .azure-pipelines/gpu-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5333bfd867da0..bc7120bbc2ae6 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -32,12 +32,9 @@ jobs: # python.version: '3.7' # ToDo: this need to have installed docker in the base image... - #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 - #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6" container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6" - #endpoint: azureContainerRegistryConnection + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" workspace: From aed51a2c09213267283455efefb50f111c33384b Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 15:45:22 +0200 Subject: [PATCH 13/31] Run torch 1.8 --- .azure-pipelines/gpu-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5333bfd867da0..bc7120bbc2ae6 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -32,12 +32,9 @@ jobs: # python.version: '3.7' # ToDo: this need to have installed docker in the base image... 
- #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 - #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6" container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6" - #endpoint: azureContainerRegistryConnection + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" workspace: From e0a3e8785d2fecd63667da433a648f958d60ef89 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 16:01:33 +0200 Subject: [PATCH 14/31] Skip test due to 'Python bus error' --- tests/helpers/test_models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/helpers/test_models.py b/tests/helpers/test_models.py index e4bb7e7df0827..61b33265d1458 100644 --- a/tests/helpers/test_models.py +++ b/tests/helpers/test_models.py @@ -23,11 +23,12 @@ @pytest.mark.parametrize( - "data_class,model_class", [ + "data_class,model_class", + [ (None, BoringModel), (None, BasicGAN), (None, ParityModuleRNN), - (None, ParityModuleMNIST), + # (None, ParityModuleMNIST), (ClassifDataModule, ClassificationModel), (RegressDataModule, RegressionModel), ] From 9ee2d193832d022dd95096e932476dedcbd990d4 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 16:34:26 +0200 Subject: [PATCH 15/31] Debug NCCL --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index bc7120bbc2ae6..f1b57f9233ae3 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -71,7 +71,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + NCCL_DEBUG=INFO python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | From 3588aaa37723db12ee17969a80e4c90028c071ba Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:06:20 +0200 Subject: [PATCH 16/31] shm size --- .azure-pipelines/gpu-tests.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index f1b57f9233ae3..421ad96688d5a 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -25,17 +25,11 @@ jobs: pool: gridai-spot-pool - #strategy: - # matrix: - # PT16: - # torch.version: '1.6' - # python.version: '3.7' - # ToDo: this need to have installed docker in the base image... 
container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" workspace: clean: all From 067bf1ae9eee271aaf3c4e4ac6bf9a50ba807fa2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:28:56 +0200 Subject: [PATCH 17/31] Disable warnings for special tests --- tests/special_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 9fca3b62bad40..a87f50548d06b 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -17,7 +17,7 @@ set -e # this environment variable allows special tests to run export PL_RUNNING_SPECIAL_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no' +defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no --disable-warnings' # find tests marked as `@RunIf(special=True)` grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') From 6060b05215f0b824944bcabb2d7a4f3440625a96 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:29:25 +0200 Subject: [PATCH 18/31] Remove NCCL_DEBUG statement --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 421ad96688d5a..5499202bc690e 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -65,7 +65,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - NCCL_DEBUG=INFO python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | From f0fa1b74d0790a397702305a8cdd93ad7bcf18b7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:30:06 +0200 Subject: [PATCH 19/31] Try smaller shm size --- .azure-pipelines/gpu-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5499202bc690e..b1fedd578bc85 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -28,8 +28,11 @@ jobs: # ToDo: this need to have installed docker in the base image... container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 + # run on torch 1.8 as it's the LTS version image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" + # default shm size is 64m. Increase it to avoid: + # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=256m" workspace: clean: all From 6dd70381ce88f8ac3459de4b9795a875d596c9f5 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:31:05 +0200 Subject: [PATCH 20/31] Revert "Skip test due to 'Python bus error'" This reverts commit e0a3e8785d2fecd63667da433a648f958d60ef89. 
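The underlying "Python bus error" most likely came from DataLoader worker processes exhausting the container's /dev/shm (Docker defaults to 64m, as noted in the comment added in the previous patch alongside the --shm-size option); with the shared-memory limit raised, the ParityModuleMNIST test can be restored rather than skipped. A small illustration of that failure mode, with made-up tensor sizes (run with num_workers > 0 inside a container left at the default 64m shm to see workers die with SIGBUS):

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Worker processes hand batches back to the parent through shared memory
    # (/dev/shm in a container), so a tight shm limit kills them with SIGBUS,
    # which Python reports as a bus error.
    dataset = TensorDataset(torch.randn(2048, 3, 64, 64))
    loader = DataLoader(dataset, batch_size=512, num_workers=2)
    for (batch,) in loader:
        pass
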
--- tests/helpers/test_models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/helpers/test_models.py b/tests/helpers/test_models.py index 61b33265d1458..e4bb7e7df0827 100644 --- a/tests/helpers/test_models.py +++ b/tests/helpers/test_models.py @@ -23,12 +23,11 @@ @pytest.mark.parametrize( - "data_class,model_class", - [ + "data_class,model_class", [ (None, BoringModel), (None, BasicGAN), (None, ParityModuleRNN), - # (None, ParityModuleMNIST), + (None, ParityModuleMNIST), (ClassifDataModule, ClassificationModel), (RegressDataModule, RegressionModel), ] From 73e62f8aba385a3cad540c438fb500a46ded9648 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:15:47 +0200 Subject: [PATCH 21/31] README and adjust versions --- README.md | 4 ++-- requirements/adjust_versions.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7a540adadd327..78175f95c28fd 100644 --- a/README.md +++ b/README.md @@ -74,10 +74,10 @@ Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major
- | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (latest) | 1.9 (nightly) | + | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (LTS) | 1.9 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | Linux py3.7 [GPUs**] | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | - | + | Linux py3.7 [GPUs**] | - | - | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | | Linux py3.{6,7} [TPUs***] | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.{6,7,8,9} | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | | OSX py3.{6,7,8,9} | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index a09128c6200db..84879b4e48a34 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -4,7 +4,8 @@ from typing import Dict, Optional VERSIONS = [ - dict(torch="1.9.0", torchvision="", torchtext=""), # nightly + dict(torch="1.10.0", torchvision="", torchtext=""), # nightly + dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"), dict(torch="1.8.1", torchvision="0.9.1", torchtext="0.9.1"), dict(torch="1.8.0", torchvision="0.9.0", torchtext="0.9.0"), dict(torch="1.7.1", torchvision="0.8.2", torchtext="0.8.1"), From 902ef02b95fee49275b60a04ac8dbe9d6f682933 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:22:21 +0200 Subject: [PATCH 22/31] Avoid self.on_gpu call --- pytorch_lightning/plugins/training_type/ddp.py | 6 ++---- tests/conftest.py | 4 ---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index dcc78f7bc5d40..c04a4ab111a20 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -384,7 +384,5 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - if self.on_gpu: - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + # `is_initialized` is checked inside and we already set the default device with `set_device(self.root_device)` + torch.cuda.empty_cache() diff --git a/tests/conftest.py b/tests/conftest.py index 6cbdc3c3783c2..7f6407ecfd82b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import gc import os import sys import threading @@ -37,9 +36,6 @@ def restore_env_variables(): """ Ensures that environment variables set during the test do not leak out. """ env_backup = os.environ.copy() yield - # if a destructor accesses an environment variable, we need to make sure that `os.environ` is not cleared - # before `__del__` is called. Force the call by triggering garbage collection. 
- gc.collect() # restore environment as it was before running the test os.environ.clear() os.environ.update(env_backup) From 4ce0f9a1feaa8e85e536f5dead658d17c65611c8 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:41:53 +0200 Subject: [PATCH 23/31] empty cache cleanup --- pytorch_lightning/accelerators/gpu.py | 5 +---- .../plugins/training_type/parallel.py | 3 +-- .../plugins/training_type/single_device.py | 3 +-- .../trainer/connectors/checkpoint_connector.py | 14 +++----------- pytorch_lightning/utilities/memory.py | 15 +++++++-------- 5 files changed, 13 insertions(+), 27 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 7543a2b794b5d..1c5ff56d805a6 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -42,10 +42,7 @@ def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None: def on_train_start(self) -> None: # clear cache before training - # use context because of: - # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() @staticmethod def set_nvidia_flags(local_rank: int) -> None: diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 09e48a760e868..122a1423c2817 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -132,5 +132,4 @@ def teardown(self) -> None: # GPU teardown self.lightning_module.cpu() # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 1816f5838c948..d4a328902eba0 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -85,5 +85,4 @@ def teardown(self) -> None: # GPU teardown self.lightning_module.cpu() # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index c2a0411c0df36..0bc3145a99e59 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -21,13 +21,7 @@ import pytorch_lightning from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import ( - _OMEGACONF_AVAILABLE, - DeviceType, - rank_zero_deprecation, - rank_zero_info, - rank_zero_warn, -) +from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS @@ -69,8 +63,7 @@ def resume_start(self) -> None: return # clear cache before restore - if self.trainer._device_type == DeviceType.GPU: - torch.cuda.empty_cache() + torch.cuda.empty_cache() # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint. 
fs = get_filesystem(checkpoint_path) @@ -88,8 +81,7 @@ def resume_end(self) -> None: self._loaded_checkpoint = dict() # clear cache after restore - if self.trainer._device_type == DeviceType.GPU: - torch.cuda.empty_cache() + torch.cuda.empty_cache() # wait for all to catch up self.trainer.training_type_plugin.barrier("CheckpointConnector.resume_end") diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 6c01390a8c81e..0ae88e8995614 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -76,11 +76,10 @@ def is_out_of_cpu_memory(exception): def garbage_collection_cuda(): """Garbage collection Torch (CUDA) memory.""" gc.collect() - if torch.cuda.is_available(): - try: - # This is the last thing that should cause an OOM error, but seemingly it can. - torch.cuda.empty_cache() - except RuntimeError as exception: - if not is_oom_error(exception): - # Only handle OOM errors - raise + try: + # This is the last thing that should cause an OOM error, but seemingly it can. + torch.cuda.empty_cache() + except RuntimeError as exception: + if not is_oom_error(exception): + # Only handle OOM errors + raise From 738daa5bbccd5e00f65ea5f4c8b9218fea15839d Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 19:06:25 +0200 Subject: [PATCH 24/31] More garbage collection --- tests/plugins/test_deepspeed_plugin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 2e96ced4c0c26..b609bc78d74fc 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -650,6 +650,8 @@ def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_opt """ Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. 
""" + gc.collect() + seed_everything(42) class VerificationCallback(Callback): From 236aa97bf35af324a43ca4f729f7ebdecff5fa3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Jun 2021 21:23:50 +0200 Subject: [PATCH 25/31] Unroll parametrizations --- tests/callbacks/test_pruning.py | 41 ++++++++++++++++--- .../test_checkpoint_callback_frequency.py | 14 +++++-- tests/plugins/test_deepspeed_plugin.py | 16 +++++--- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index f198b29d24e84..1a5ddad64106e 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -162,13 +162,44 @@ def test_pruning_callback( @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize("parameters_to_prune", [False, True]) -@pytest.mark.parametrize("use_global_unstructured", [False, True]) -def test_pruning_callback_ddp(tmpdir, use_global_unstructured: bool, parameters_to_prune: bool): +def test_pruning_callback_ddp_0(tmpdir): train_with_pruning_callback( tmpdir, - parameters_to_prune=parameters_to_prune, - use_global_unstructured=use_global_unstructured, + parameters_to_prune=False, + use_global_unstructured=False, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_1(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=False, + use_global_unstructured=True, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_2(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=True, + use_global_unstructured=False, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_3(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=True, + use_global_unstructured=True, accelerator="ddp", gpus=2, ) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 9fdd69dba7a9a..c5afecc2b4bf3 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -105,10 +105,18 @@ def training_step(self, batch, batch_idx): assert save_mock.call_count == expected -@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)]) -def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): +def test_top_k_ddp_0(tmpdir): + _top_k_ddp(tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) + + +@RunIf(special=True, min_gpus=2) +def test_top_k_ddp_1(tmpdir): + _top_k_ddp(tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) + + +@mock.patch('torch.save') +def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index b609bc78d74fc..b443827cac70c 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -644,14 +644,10 @@ def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir): run_checkpoint_test(tmpdir, save_full_weights=True, automatic_optimization=False, accumulate_grad_batches=1) -@RunIf(min_gpus=2, deepspeed=True, special=True) -@pytest.mark.parametrize('offload_optimizer', [True, False]) -def 
test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): +def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): """ Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. """ - gc.collect() - seed_everything(42) class VerificationCallback(Callback): @@ -678,6 +674,16 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm) +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): + _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) + + +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): + _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) + + @RunIf(min_gpus=2, deepspeed=True, special=True) def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): """ From ffa532d3bb888606438577f98e7ae512fa28a0cd Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 25 Jun 2021 00:30:21 +0200 Subject: [PATCH 26/31] Do not reuse mock --- .../test_checkpoint_callback_frequency.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index c5afecc2b4bf3..67db594aa2539 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -105,17 +105,18 @@ def training_step(self, batch, batch_idx): assert save_mock.call_count == expected +@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -def test_top_k_ddp_0(tmpdir): - _top_k_ddp(tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) +def test_top_k_ddp_0(save_mock, tmpdir): + _top_k_ddp(save_mock, tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) +@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -def test_top_k_ddp_1(tmpdir): - _top_k_ddp(tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) +def test_top_k_ddp_1(save_mock, tmpdir): + _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) -@mock.patch('torch.save') def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): From 7a123543ab23c70920f9528a413479690a5d898e Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 25 Jun 2021 02:16:28 +0200 Subject: [PATCH 27/31] Remove abbreviation --- pytorch_lightning/plugins/training_type/ddp.py | 18 ++++++++++-------- .../plugins/training_type/ddp_spawn.py | 10 ++++++---- .../plugins/training_type/horovod.py | 4 ++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index c04a4ab111a20..0335ca357b055 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -21,7 +21,7 @@ import __main__ import numpy as np import torch -import torch.distributed as torch_distrib +import torch.distributed from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import Optimizer @@ -234,7 +234,7 @@ def setup_distributed(self): self.init_ddp_connection() # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch_distrib.is_initialized(): + if self.is_global_zero and not torch.distributed.is_initialized(): log.info("-" * 100) 
log.info(f"distributed_backend={self.distributed_backend}") log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") @@ -304,9 +304,11 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt world_size = world_size if world_size is not None else self.cluster_environment.world_size() os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - if not torch_distrib.is_initialized(): + if not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size) + torch.distributed.init_process_group( + self.torch_distributed_backend, rank=global_rank, world_size=world_size + ) def pre_dispatch(self): # move the model to the correct device @@ -323,8 +325,8 @@ def post_dispatch(self) -> None: self.cluster_environment.teardown() def barrier(self, *args, **kwargs): - if torch_distrib.is_available() and torch_distrib.is_initialized(): - torch_distrib.barrier() + if torch.distributed.is_initialized(): + torch.distributed.barrier() def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) @@ -382,7 +384,7 @@ def register_plugins(cls, plugin_registry: Dict) -> None: ) def __del__(self) -> None: - if torch_distrib.is_initialized(): - torch_distrib.destroy_process_group() + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() # `is_initialized` is checked inside and we already set the default device with `set_device(self.root_device)` torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 47f2a64c04759..d3a5acce112c1 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -17,7 +17,7 @@ from typing import Any, List, Optional, Union import torch -import torch.distributed as torch_distrib +import torch.distributed import torch.multiprocessing as mp from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import Optimizer @@ -265,7 +265,9 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i if not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size) + torch.distributed.init_process_group( + self.torch_distributed_backend, rank=global_rank, world_size=world_size + ) def determine_ddp_device_ids(self): if self.root_device.type == "cpu": @@ -306,8 +308,8 @@ def __recover_child_process_weights(self, best_path, last_path): self.lightning_module.load_state_dict(ckpt) def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): - torch_distrib.barrier() + if torch.distributed.is_initialized(): + torch.distributed.barrier() def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 99899aed11753..cbd9e80dabc23 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -15,7 +15,7 @@ from typing import Any, 
List, Optional, Union import torch -import torch.distributed as torch_distrib +import torch.distributed from torch.optim.lr_scheduler import _LRScheduler, Optimizer from pytorch_lightning.core.optimizer import LightningOptimizer @@ -125,7 +125,7 @@ def start_predicting(self, trainer): self.join() def barrier(self, *args, **kwargs): - if torch_distrib.is_initialized(): + if torch.distributed.is_initialized(): self.join() def broadcast(self, obj: object, src: int = 0) -> object: From 74d8a7d5cc0f0043c4551f393dc2b4a17ce53625 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 29 Jun 2021 17:45:34 +0200 Subject: [PATCH 28/31] Has initialized ddp --- pytorch_lightning/plugins/training_type/ddp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 89ae36227d884..8957967ec31a2 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -100,6 +100,7 @@ def __init__( self._ddp_comm_wrapper = ddp_comm_wrapper self._pids: Optional[List[int]] = None self._sync_dir: Optional[str] = None + self._has_initialized_ddp: bool = False self.set_world_ranks() @property @@ -310,7 +311,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt torch.distributed.init_process_group( self.torch_distributed_backend, rank=global_rank, world_size=world_size ) - + self._has_initialized_ddp = True # on rank=0 let everyone know training is starting rank_zero_info( f"{'-' * 100}\n" @@ -335,12 +336,12 @@ def post_dispatch(self) -> None: self.cluster_environment.teardown() def barrier(self, *args, **kwargs) -> None: - if not torch_distrib.is_initialized(): + if not torch.distributed.is_initialized(): return if _TORCH_GREATER_EQUAL_1_8 and torch.distributed.get_backend() == "nccl": - torch_distrib.barrier(device_ids=self.determine_ddp_device_ids()) + torch.distributed.barrier(device_ids=self.determine_ddp_device_ids()) else: - torch_distrib.barrier() + torch.distributed.barrier() def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) @@ -436,7 +437,6 @@ def reconciliate_processes(self, trace: str): raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") def __del__(self) -> None: - if torch.distributed.is_initialized(): + if torch.distributed.is_initialized() and self._has_initialized_ddp: torch.distributed.destroy_process_group() - # `is_initialized` is checked inside and we already set the default device with `set_device(self.root_device)` torch.cuda.empty_cache() From 3c2ac1063195b77e9e1dc91b9660022bbcd056f8 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 2 Jul 2021 16:08:11 +0200 Subject: [PATCH 29/31] Merge master --- pytorch_lightning/plugins/training_type/ddp.py | 2 +- tests/conftest.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index bc6fd76429185..f66c1ae00c15f 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -453,6 +453,6 @@ def reconciliate_processes(self, trace: str): raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}") def __del__(self) -> None: - if torch.distributed.is_initialized() and self._has_initialized_ddp: + if distributed_available() and self._has_initialized_ddp: 
torch.distributed.destroy_process_group() torch.cuda.empty_cache() diff --git a/tests/conftest.py b/tests/conftest.py index 3f767d8b6fad2..7f6407ecfd82b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,7 +18,6 @@ from http.server import SimpleHTTPRequestHandler import pytest -import torch.distributed import torch.multiprocessing as mp @@ -42,14 +41,6 @@ def restore_env_variables(): os.environ.update(env_backup) -@pytest.fixture(scope="function", autouse=True) -def teardown_process_group(): - """ Ensures that the distributed process group gets closed before the next test runs. """ - yield - if torch.distributed.is_available() and torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - def pytest_configure(config): config.addinivalue_line("markers", "spawn: spawn test in a separate process using torch.multiprocessing.spawn") From 9ee6ee2d260a670ae2941231c6ba90c44a47b495 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 2 Jul 2021 16:09:02 +0200 Subject: [PATCH 30/31] Merge master --- tests/plugins/test_deepspeed_plugin.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 8f24c17bcb6a1..6c238ab747350 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -684,16 +684,6 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm) -@RunIf(min_gpus=2, deepspeed=True, special=True) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): - _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) - - -@RunIf(min_gpus=2, deepspeed=True, special=True) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): - _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) - - @RunIf(min_gpus=2, deepspeed=True, special=True) def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): """ From fc6338ea698ae33aa542835b79be10ef2d823939 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 2 Jul 2021 16:10:05 +0200 Subject: [PATCH 31/31] Unnecessary annotation --- pytorch_lightning/plugins/training_type/ddp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index f66c1ae00c15f..3afd5c7d4097a 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -106,7 +106,7 @@ def __init__( self._ddp_comm_wrapper = ddp_comm_wrapper self._pids: Optional[List[int]] = None self._sync_dir: Optional[str] = None - self._has_initialized_ddp: bool = False + self._has_initialized_ddp = False self.set_world_ranks() @property
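
Taken together, the series ends with a clear ownership rule: the DDP plugin destroys the process group in __del__ only if it was the one that created it, tracked by _has_initialized_ddp, and torch.cuda.empty_cache() needs no device guard or GPU check because it is a no-op when CUDA was never initialized. A condensed sketch of that final shape (names simplified; distributed_available stands in for Lightning's helper of the same name, and MASTER_ADDR/MASTER_PORT must already be set, as the plugin arranges via its cluster environment):

    import torch
    import torch.distributed

    def distributed_available() -> bool:
        # Stand-in for Lightning's `distributed_available` helper.
        return torch.distributed.is_available() and torch.distributed.is_initialized()

    class DDPPluginSketch:

        def __init__(self) -> None:
            # Set only by the code path that actually called
            # init_process_group, so the destructor never tears down a
            # process group created elsewhere (another plugin, or the
            # user's own script).
            self._has_initialized_ddp = False

        def init_ddp_connection(self, backend: str, global_rank: int, world_size: int) -> None:
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend, rank=global_rank, world_size=world_size)
                self._has_initialized_ddp = True

        def __del__(self) -> None:
            if distributed_available() and self._has_initialized_ddp:
                torch.distributed.destroy_process_group()
            # No `with torch.cuda.device(...)` wrapper needed: empty_cache
            # checks CUDA initialization internally and no-ops otherwise.
            torch.cuda.empty_cache()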