From 655969f09a889ed48b979f535bea570f2efd28f7 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Thu, 18 Feb 2021 23:37:35 +0000
Subject: [PATCH 01/17] Enable ZeRO optimization, and make sure that the
 lightning module hook is called when we move to half precision

---
 pytorch_lightning/plugins/training_type/deepspeed.py |  3 +++
 tests/plugins/test_deepspeed_plugin.py               | 10 +++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 0f9a8378052a5..74e06d35f4318 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -49,6 +49,9 @@ def forward(self, *inputs, **kwargs):
 
         return super().forward(*inputs, **kwargs)
 
+    def half(self):
+        self.module.half()
+
     @staticmethod
     def batch_to(data):
         return data.half()
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 9c9c5c097b4c5..00302eddecfa9 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -182,7 +182,7 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
     trainer = Trainer(
         fast_dev_run=True,
         default_root_dir=tmpdir,
-        plugins=DeepSpeedPlugin(zero_optimization=False),
+        plugins=DeepSpeedPlugin(),
         gpus=1,
     )
     with pytest.warns(UserWarning, match='Overridden backward hook in the LightningModule will be ignored'):
@@ -210,7 +210,7 @@ def on_train_start(self) -> None:
 
     model = TestModel()
     trainer = Trainer(
-        plugins=DeepSpeedPlugin(zero_optimization=False),
+        plugins=DeepSpeedPlugin(),
         default_root_dir=tmpdir,
         gpus=1,
         fast_dev_run=True,
@@ -267,7 +267,7 @@ def test_deepspeed_multigpu(tmpdir, deepspeed_config):
     """
     model = BoringModel()
     trainer = Trainer(
-        plugins=[DeepSpeedPlugin(zero_optimization=False)],
+        plugins=[DeepSpeedPlugin()],
         default_root_dir=tmpdir,
         gpus=2,
         fast_dev_run=True,
@@ -285,8 +285,8 @@ def _assert_save_model_is_equal(model, tmpdir, trainer):
     # carry out the check only on rank 0
     if trainer.global_rank == 0:
         saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
-        saved_model = saved_model.float()
-        model = model.float().cpu()
+        saved_model = saved_model.half()  # model is loaded in float32 as default, move it to float16
+        model = model.cpu()
         # Assert model parameters are identical after loading
         for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()):
             assert torch.equal(orig_param, trained_model_param)
From 75a54e2d7d9d8451b69075278d5bf5e3b350e0af Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Thu, 18 Feb 2021 23:52:47 +0000
Subject: [PATCH 02/17] Added test, update to function

---
 .../plugins/training_type/deepspeed.py |  3 +++
 tests/plugins/test_deepspeed_plugin.py | 21 ++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 74e06d35f4318..5c5edfcf3c793 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -52,6 +52,9 @@ def forward(self, *inputs, **kwargs):
     def half(self):
         self.module.half()
 
+    def to(self, *args, **kwargs):
+        self.module.to(*args, **kwargs)
+
     @staticmethod
     def batch_to(data):
         return data.half()
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 00302eddecfa9..87709c62925aa 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,5 +1,6 @@
 import json
 import os
+from unittest.mock import patch
 
 import pytest
 import torch
@@ -8,11 +9,28 @@
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
+from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities import _APEX_AVAILABLE, _DEEPSPEED_AVAILABLE, _NATIVE_AMP_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 
 
+@patch.object(BoringModel, 'to')
+def test_deepspeed_wrapper(mocked_to, tmpdir):
+    """
+    Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly.
+    """
+
+    model = BoringModel()
+    module = LightningDeepSpeedModule(model, precision=16)
+
+    module.half()
+    assert model.dtype == torch.half
+
+    module.to('cuda')
+    assert mocked_to.called, "LightningDeepSpeedModule did not call LightningModule `to` hook when transferring device"
+
+
 @pytest.fixture
 def deepspeed_config():
     return {
@@ -285,7 +303,8 @@ def _assert_save_model_is_equal(model, tmpdir, trainer):
     # carry out the check only on rank 0
     if trainer.global_rank == 0:
         saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
-        saved_model = saved_model.half()  # model is loaded in float32 as default, move it to float16
+        if model.dtype == torch.half:
+            saved_model = saved_model.half()  # model is loaded in float32 as default, move it to float16
         model = model.cpu()
         # Assert model parameters are identical after loading
         for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()):
             assert torch.equal(orig_param, trained_model_param)
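The two patches above teach the DeepSpeed wrapper to forward `half()`/`to()` calls to the wrapped LightningModule, while its `forward` casts float inputs to half under 16-bit precision. A minimal standalone sketch of that behaviour (plain PyTorch, not the Lightning implementation; `HalfPrecisionWrapper` is an invented name):

    import torch
    from torch import nn


    class HalfPrecisionWrapper(nn.Module):
        """Illustrative stand-in for the wrapper's input handling."""

        def __init__(self, module: nn.Module, precision: int = 16):
            super().__init__()
            self.module = module
            self.precision = precision

        def forward(self, *inputs):
            if self.precision == 16:
                # cast only float32 tensors; ints, bools and non-tensors pass through untouched
                inputs = tuple(
                    x.half() if isinstance(x, torch.Tensor) and x.dtype == torch.float32 else x
                    for x in inputs
                )
            return self.module(*inputs)


    wrapper = HalfPrecisionWrapper(nn.Identity())
    out = wrapper(torch.randn(1, 32))  # float32 input reaches the wrapped module as float16
    assert out.dtype == torch.half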
From c5413ab742ebf363a02f77c839e403f3f7f77a96 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 10:17:55 +0000
Subject: [PATCH 03/17] Use device type mixin

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 12 +++++-------
 tests/plugins/test_deepspeed_plugin.py               |  9 +++++----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 5c5edfcf3c793..f6b3e7e392172 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -29,6 +29,7 @@
 from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
 from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.utilities.apply_func import apply_to_collection
+from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
 from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE
@@ -37,7 +38,7 @@
     import deepspeed
 
 
-class LightningDeepSpeedModule(_LightningModuleWrapperBase):
+class LightningDeepSpeedModule(_LightningModuleWrapperBase, DeviceDtypeModuleMixin):
 
     def __init__(self, pl_module: LightningModule, precision: int):
         super().__init__(pl_module)
@@ -49,12 +50,6 @@ def forward(self, *inputs, **kwargs):
 
         return super().forward(*inputs, **kwargs)
 
-    def half(self):
-        self.module.half()
-
-    def to(self, *args, **kwargs):
-        self.module.to(*args, **kwargs)
-
     @staticmethod
     def batch_to(data):
         return data.half()
@@ -63,6 +58,9 @@ def _move_float_tensors_to_half(self, batch: Any):
         batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to)
         return batch
 
+    def on_post_move_to_device(self):
+        pass
+
 
 class DeepSpeedPlugin(DDPPlugin):
     distributed_backend = "deepspeed"
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 87709c62925aa..c5bb5c4f76574 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -15,8 +15,7 @@
 from tests.helpers.boring_model import BoringModel
 
 
-@patch.object(BoringModel, 'to')
-def test_deepspeed_wrapper(mocked_to, tmpdir):
+def test_lightning_module_base_wrapper(tmpdir):
     """
     Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly.
     """
@@ -25,10 +24,12 @@ def test_deepspeed_wrapper(mocked_to, tmpdir):
     module = LightningDeepSpeedModule(model, precision=16)
 
     module.half()
+    assert module.dtype == torch.half
     assert model.dtype == torch.half
 
-    module.to('cuda')
-    assert mocked_to.called, "LightningDeepSpeedModule did not call LightningModule `to` hook when transferring device"
+    module.to(torch.double)
+    assert module.dtype == torch.double
+    assert model.dtype == torch.double
 
 
 @pytest.fixture

From 1c1c114824bf776fc9858a3b63962b234efd905a Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 11:04:25 +0000
Subject: [PATCH 04/17] Add precision

---
 tests/plugins/test_deepspeed_plugin.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index c5bb5c4f76574..995a53bcc1b9b 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -198,12 +198,7 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
         return loss.backward()
 
     model = TestModel()
-    trainer = Trainer(
-        fast_dev_run=True,
-        default_root_dir=tmpdir,
-        plugins=DeepSpeedPlugin(),
-        gpus=1,
-    )
+    trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, plugins=DeepSpeedPlugin(), gpus=1, precision=16)
     with pytest.warns(UserWarning, match='Overridden backward hook in the LightningModule will be ignored'):
         trainer.fit(model)
 
@@ -228,12 +223,7 @@ def on_train_start(self) -> None:
         assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)
 
     model = TestModel()
-    trainer = Trainer(
-        plugins=DeepSpeedPlugin(),
-        default_root_dir=tmpdir,
-        gpus=1,
-        fast_dev_run=True,
-    )
+    trainer = Trainer(plugins=DeepSpeedPlugin(), default_root_dir=tmpdir, gpus=1, fast_dev_run=True, precision=16)
     trainer.fit(model)
 
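Patch 3 swaps the hand-written `half()`/`to()` overrides for `DeviceDtypeModuleMixin`, which records the device and dtype a module was last moved to so that the wrapper and the wrapped LightningModule report the same state. A rough sketch of that concept (not the actual mixin; it simply derives the state from the module's parameters after each move):

    import torch
    from torch import nn


    class DeviceDtypeTrackingSketch(nn.Module):
        """Record where the module currently lives whenever it is moved."""

        _device = torch.device('cpu')
        _dtype = torch.float32

        @property
        def device(self):
            return self._device

        @property
        def dtype(self):
            return self._dtype

        def _refresh(self):
            # derive the current state from the first parameter after any move
            param = next(self.parameters(), None)
            if param is not None:
                self._device, self._dtype = param.device, param.dtype

        def to(self, *args, **kwargs):
            module = super().to(*args, **kwargs)
            self._refresh()
            return module

        def half(self):
            module = super().half()
            self._refresh()
            return module


    class WrapperSketch(DeviceDtypeTrackingSketch):

        def __init__(self, module: nn.Module):
            super().__init__()
            self.module = module


    wrapped = WrapperSketch(nn.Linear(4, 4))
    wrapped.half()
    assert wrapped.dtype == torch.half and wrapped.module.weight.dtype == torch.half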
From 1d1a1e1c3e80218ebc4c8a2f48c5986ce055ef66 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 12:13:25 +0000
Subject: [PATCH 05/17] Turn off zero for checking optimizers are correct

---
 tests/plugins/test_deepspeed_plugin.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 995a53bcc1b9b..42bdc503cd52e 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -223,7 +223,12 @@ def on_train_start(self) -> None:
         assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)
 
     model = TestModel()
-    trainer = Trainer(plugins=DeepSpeedPlugin(), default_root_dir=tmpdir, gpus=1, fast_dev_run=True, precision=16)
+    trainer = Trainer(
+        plugins=DeepSpeedPlugin(zero_optimization=False),  # disable ZeRO so our optimizers are not wrapped
+        default_root_dir=tmpdir,
+        gpus=1,
+        fast_dev_run=True
+    )
     trainer.fit(model)
 

From d3fcc09481297b3f59a5a64138ab4d39ae351a90 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 12:25:03 +0000
Subject: [PATCH 06/17] Remove import

---
 tests/plugins/test_deepspeed_plugin.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 42bdc503cd52e..eade4e9864fae 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,6 +1,5 @@
 import json
 import os
-from unittest.mock import patch
 
 import pytest
 import torch

From f5d25fd084110c15588a1d8b1f5d39de5630ed46 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 13:24:46 +0000
Subject: [PATCH 07/17] Use FP16 Wrapper

---
 tests/plugins/test_deepspeed_plugin.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index eade4e9864fae..54517d2b55729 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -216,17 +216,21 @@ def test_deepspeed_run_configure_optimizers(tmpdir):
     class TestModel(BoringModel):
 
         def on_train_start(self) -> None:
-            assert isinstance(self.trainer.optimizers[0], torch.optim.SGD)
+            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer
+
+            assert isinstance(self.trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
+            assert isinstance(self.trainer.optimizers[0].optimizer, torch.optim.SGD)
             assert self.trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
             # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler
             assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)
 
     model = TestModel()
     trainer = Trainer(
-        plugins=DeepSpeedPlugin(zero_optimization=False),  # disable ZeRO so our optimizers are not wrapped
+        plugins=DeepSpeedPlugin(),  # ZeRO stays enabled, so DeepSpeed wraps the optimizer
         default_root_dir=tmpdir,
         gpus=1,
-        fast_dev_run=True
+        fast_dev_run=True,
+        precision=16
     )
     trainer.fit(model)
 
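With ZeRO left enabled, the optimizer the trainer sees is DeepSpeed's `FP16_DeepSpeedZeroOptimizer`, and the original `torch.optim.SGD` sits one level down on its `.optimizer` attribute, which is what the updated assertions rely on. A small hypothetical helper (not part of Lightning or DeepSpeed) for reaching the underlying optimizer either way:

    def unwrap_optimizer(optimizer):
        # ZeRO exposes the wrapped optimizer as `.optimizer`; fall back to the
        # object itself when no wrapping took place (e.g. zero_optimization=False)
        return getattr(optimizer, 'optimizer', optimizer)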
""" diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index f6b3e7e392172..0f9a8378052a5 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -29,7 +29,6 @@ from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE @@ -38,7 +37,7 @@ import deepspeed -class LightningDeepSpeedModule(_LightningModuleWrapperBase, DeviceDtypeModuleMixin): +class LightningDeepSpeedModule(_LightningModuleWrapperBase): def __init__(self, pl_module: LightningModule, precision: int): super().__init__(pl_module) @@ -58,9 +57,6 @@ def _move_float_tensors_to_half(self, batch: Any): batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) return batch - def on_post_move_to_device(self): - pass - class DeepSpeedPlugin(DDPPlugin): distributed_backend = "deepspeed" From 8969d179b5bd237d299bd078b1d5d58bddee19fd Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 19 Feb 2021 14:39:37 +0000 Subject: [PATCH 09/17] Better name for the test, test precision move --- tests/plugins/test_deepspeed_plugin.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 54517d2b55729..55825d311030e 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -14,7 +14,7 @@ from tests.helpers.boring_model import BoringModel -def test_lightning_module_base_wrapper(tmpdir): +def test_deepspeed_lightning_module(tmpdir): """ Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly. """ @@ -26,6 +26,11 @@ def test_lightning_module_base_wrapper(tmpdir): assert module.dtype == torch.half assert model.dtype == torch.half + x = torch.randn((1, 32), dtype=torch.float) + out = module(x) + + assert out.dtype == torch.half + module.to(torch.double) assert module.dtype == torch.double assert model.dtype == torch.double From 7795fcbb4317b76b1d3c34cb50399dcba71fc254 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 19 Feb 2021 14:42:23 +0000 Subject: [PATCH 10/17] Added CHANGELOG.md --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ad54381a082b..98d3f9bfa8c16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
From 7795fcbb4317b76b1d3c34cb50399dcba71fc254 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 14:42:23 +0000
Subject: [PATCH 10/17] Added CHANGELOG.md

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ad54381a082b..98d3f9bfa8c16 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070))
+
+
 ## [1.2.0] - 2021-02-18
 
 ### Added

From 98c152d043aed62b3ab9ddcb675f4c8e813f12fe Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 15:36:22 +0000
Subject: [PATCH 11/17] Revert "Added CHANGELOG.md"

This reverts commit 7795fcbb
---
 CHANGELOG.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 98d3f9bfa8c16..2ad54381a082b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,9 +22,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070))
-
-
 ## [1.2.0] - 2021-02-18
 
 ### Added
+ """ + + model = BoringModel() + module = LightningDeepSpeedModule(model, precision=16) + + module.cuda().half() + assert module.dtype == torch.half + assert model.dtype == torch.half + + x = torch.randn((1, 32), dtype=torch.float).cuda() out = module(x) assert out.dtype == torch.half From 23824aa4664f5d28ee875bac176c1cf8922b768d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 19 Feb 2021 19:14:33 +0000 Subject: [PATCH 13/17] Provide ZeRO config --- tests/plugins/test_deepspeed_plugin.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index c6c365a08b753..fbb53974efd33 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -75,6 +75,11 @@ def deepspeed_config(): } +@pytest.fixture +def deepspeed_zero_config(deepspeed_config): + return {**deepspeed_config, 'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2}} + + @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.") def test_deepspeed_plugin_string(tmpdir): """ @@ -266,7 +271,7 @@ def on_train_start(self) -> None: @pytest.mark.skipif( not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" ) -def test_deepspeed_config(tmpdir, deepspeed_config): +def test_deepspeed_config(tmpdir, deepspeed_zero_config): """ Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers and saves the model weights to load correctly. @@ -275,15 +280,18 @@ def test_deepspeed_config(tmpdir, deepspeed_config): class TestModel(BoringModel): def on_train_start(self) -> None: - import deepspeed - assert isinstance(self.trainer.optimizers[0], torch.optim.SGD) + from deepspeed.runtime.lr_schedules import WarmupLR + from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer + + assert isinstance(self.trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer) + assert isinstance(self.trainer.optimizers[0].optimizer, torch.optim.SGD) assert self.trainer.lr_schedulers == [] # DeepSpeed manages LR scheduler internally - assert isinstance(self.trainer.model.optimizer, torch.optim.SGD) - assert isinstance(self.trainer.model.lr_scheduler, deepspeed.runtime.lr_schedules.WarmupLR) + # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler + assert isinstance(self.trainer.model.lr_scheduler, WarmupLR) model = TestModel() trainer = Trainer( - plugins=[DeepSpeedPlugin(config=deepspeed_config)], + plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)], default_root_dir=tmpdir, gpus=1, fast_dev_run=True, From 7a6cd1e095025f96fe2b9b34f92695118cc67702 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 19 Feb 2021 19:57:41 +0000 Subject: [PATCH 14/17] Revert "Revert "Added CHANGELOG.md"" This reverts commit 98c152d0 --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7dad863d41293..69e80203f12da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
From 7a6cd1e095025f96fe2b9b34f92695118cc67702 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 19:57:41 +0000
Subject: [PATCH 14/17] Revert "Revert "Added CHANGELOG.md""

This reverts commit 98c152d0
---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7dad863d41293..69e80203f12da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed incorrect yield logic for the amp autocast context manager ([#6080](https://github.com/PyTorchLightning/pytorch-lightning/pull/6080))
 
+Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070))
+
+
 ## [1.2.0] - 2021-02-18
 
 ### Added

From 49ec362156c23372d049a851b7d0e9ed7df153ec Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 20:06:42 +0000
Subject: [PATCH 15/17] Support torch device as input to cuda, as is with
 upstream pytorch

---
 pytorch_lightning/utilities/device_dtype_mixin.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/utilities/device_dtype_mixin.py b/pytorch_lightning/utilities/device_dtype_mixin.py
index 6408c6e21cad4..3e3eccc93b368 100644
--- a/pytorch_lightning/utilities/device_dtype_mixin.py
+++ b/pytorch_lightning/utilities/device_dtype_mixin.py
@@ -119,7 +119,7 @@ def to(self, *args, **kwargs) -> Module:
         self.__update_properties(device=out[0], dtype=out[1])
         return super().to(*args, **kwargs)
 
-    def cuda(self, device: Optional[int] = None) -> Module:
+    def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Module:
         """Moves all model parameters and buffers to the GPU.
         This also makes associated parameters and buffers different objects. So
         it should be called before constructing optimizer if the module will
@@ -132,7 +132,8 @@ def cuda(self, device: Optional[int] = None) -> Module:
         Returns:
             Module: self
         """
-        self.__update_properties(device=torch.device('cuda', index=device))
+        property_device = device if isinstance(device, torch.device) else torch.device('cuda', index=device)
+        self.__update_properties(device=property_device)
         return super().cuda(device=device)
 
     def cpu(self) -> Module:

From 71ea9bf7f4ca5512b6b6537268e54860fed9a4a7 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Sat, 20 Feb 2021 20:10:40 +0000
Subject: [PATCH 16/17] Modify test to include all possible cuda variations

---
 tests/utilities/test_dtype_device_mixin.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tests/utilities/test_dtype_device_mixin.py b/tests/utilities/test_dtype_device_mixin.py
index 17e208022a5ac..45a85744f0415 100644
--- a/tests/utilities/test_dtype_device_mixin.py
+++ b/tests/utilities/test_dtype_device_mixin.py
@@ -101,12 +101,19 @@ def test_submodules_multi_gpu_ddp_spawn(tmpdir):
     trainer.fit(model)
 
 
+@pytest.mark.parametrize(
+    ['device'],
+    [
+        pytest.param(None),  # explicitly call without an index to see if the returning device contains an index
+        pytest.param(0),
+        pytest.param(torch.device('cuda', 0)),
+    ]
+)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-def test_gpu_device_includes_index():
+def test_gpu_cuda_device(device):
     model = TopModule()
-    # explicitly call without an index to see if the returning device contains an index (it should!)
-    model.cuda()
+
+    model.cuda(device)
 
     device = model.device
     assert device.type == 'cuda'

From 17715c7f24f1c12ffdffe9d33b471c348ed893ed Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Sat, 20 Feb 2021 23:52:20 +0000
Subject: [PATCH 17/17] Trigger Build
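The last two functional patches let `DeviceDtypeModuleMixin.cuda()` accept a `torch.device` as well as a bare index, matching upstream `torch.nn.Module.cuda`, and extend the mixin test accordingly. The normalisation added in the mixin boils down to the following sketch (`normalize_cuda_device` is an invented name for illustration):

    from typing import Optional, Union

    import torch


    def normalize_cuda_device(device: Optional[Union[torch.device, int]]) -> torch.device:
        # keep a torch.device untouched, wrap a bare index into a CUDA device
        return device if isinstance(device, torch.device) else torch.device('cuda', index=device)


    assert normalize_cuda_device(0) == torch.device('cuda', 0)
    assert normalize_cuda_device(torch.device('cuda', 1)) == torch.device('cuda', 1)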