From 655969f09a889ed48b979f535bea570f2efd28f7 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Thu, 18 Feb 2021 23:37:35 +0000
Subject: [PATCH 01/17] Enable ZeRO optimization, and make sure that the
 lightning module hook is called when we move to half precision

---
 pytorch_lightning/plugins/training_type/deepspeed.py |  3 +++
 tests/plugins/test_deepspeed_plugin.py               | 10 +++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 0f9a8378052a5..74e06d35f4318 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -49,6 +49,9 @@ def forward(self, *inputs, **kwargs):
 
         return super().forward(*inputs, **kwargs)
 
+    def half(self):
+        self.module.half()
+
     @staticmethod
     def batch_to(data):
         return data.half()
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 9c9c5c097b4c5..00302eddecfa9 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -182,7 +182,7 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
     trainer = Trainer(
         fast_dev_run=True,
         default_root_dir=tmpdir,
-        plugins=DeepSpeedPlugin(zero_optimization=False),
+        plugins=DeepSpeedPlugin(),
         gpus=1,
     )
     with pytest.warns(UserWarning, match='Overridden backward hook in the LightningModule will be ignored'):
@@ -210,7 +210,7 @@ def on_train_start(self) -> None:
 
     model = TestModel()
     trainer = Trainer(
-        plugins=DeepSpeedPlugin(zero_optimization=False),
+        plugins=DeepSpeedPlugin(),
         default_root_dir=tmpdir,
         gpus=1,
         fast_dev_run=True,
@@ -267,7 +267,7 @@ def test_deepspeed_multigpu(tmpdir, deepspeed_config):
     """
     model = BoringModel()
     trainer = Trainer(
-        plugins=[DeepSpeedPlugin(zero_optimization=False)],
+        plugins=[DeepSpeedPlugin()],
         default_root_dir=tmpdir,
         gpus=2,
         fast_dev_run=True,
@@ -285,8 +285,8 @@ def _assert_save_model_is_equal(model, tmpdir, trainer):
     # carry out the check only on rank 0
     if trainer.global_rank == 0:
         saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
-        saved_model = saved_model.float()
-        model = model.float().cpu()
+        saved_model = saved_model.half()  # model is loaded in float32 as default, move it to float16
+        model = model.cpu()
         # Assert model parameters are identical after loading
         for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()):
             assert torch.equal(orig_param, trained_model_param)
From 75a54e2d7d9d8451b69075278d5bf5e3b350e0af Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Thu, 18 Feb 2021 23:52:47 +0000
Subject: [PATCH 02/17] Added test, update to function

---
 .../plugins/training_type/deepspeed.py |  3 +++
 tests/plugins/test_deepspeed_plugin.py | 21 ++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 74e06d35f4318..5c5edfcf3c793 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -52,6 +52,9 @@ def forward(self, *inputs, **kwargs):
     def half(self):
         self.module.half()
 
+    def to(self, *args, **kwargs):
+        self.module.to(*args, **kwargs)
+
     @staticmethod
     def batch_to(data):
         return data.half()
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 00302eddecfa9..87709c62925aa 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,5 +1,6 @@
 import json
 import os
+from unittest.mock import patch
 
 import pytest
 import torch
@@ -8,11 +9,28 @@
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
+from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities import _APEX_AVAILABLE, _DEEPSPEED_AVAILABLE, _NATIVE_AMP_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 
 
+@patch.object(BoringModel, 'to')
+def test_deepspeed_wrapper(mocked_to, tmpdir):
+    """
+    Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly.
+    """
+
+    model = BoringModel()
+    module = LightningDeepSpeedModule(model, precision=16)
+
+    module.half()
+    assert model.dtype == torch.half
+
+    module.to('cuda')
+    assert mocked_to.called, "LightningDeepSpeedModule did not call LightningModule `to` hook when transferring device"
+
+
 @pytest.fixture
 def deepspeed_config():
     return {
@@ -285,7 +303,8 @@ def _assert_save_model_is_equal(model, tmpdir, trainer):
     # carry out the check only on rank 0
     if trainer.global_rank == 0:
         saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
-        saved_model = saved_model.half()  # model is loaded in float32 as default, move it to float16
+        if model.dtype == torch.half:
+            saved_model = saved_model.half()  # model is loaded in float32 as default, move it to float16
         model = model.cpu()
         # Assert model parameters are identical after loading
         for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()):
             assert torch.equal(orig_param, trained_model_param)
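The two patches above teach the DeepSpeed wrapper to forward `half()`/`to()` calls to the wrapped LightningModule, while its `forward` casts float inputs to half under 16-bit precision. A minimal standalone sketch of that behaviour (plain PyTorch, not the Lightning implementation; `HalfPrecisionWrapper` is an invented name):

    import torch
    from torch import nn


    class HalfPrecisionWrapper(nn.Module):
        """Illustrative stand-in for the wrapper's input handling."""

        def __init__(self, module: nn.Module, precision: int = 16):
            super().__init__()
            self.module = module
            self.precision = precision

        def forward(self, *inputs):
            if self.precision == 16:
                # cast only float32 tensors; ints, bools and non-tensors pass through untouched
                inputs = tuple(
                    x.half() if isinstance(x, torch.Tensor) and x.dtype == torch.float32 else x
                    for x in inputs
                )
            return self.module(*inputs)


    wrapper = HalfPrecisionWrapper(nn.Identity())
    out = wrapper(torch.randn(1, 32))  # float32 input reaches the wrapped module as float16
    assert out.dtype == torch.half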
From c5413ab742ebf363a02f77c839e403f3f7f77a96 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 10:17:55 +0000
Subject: [PATCH 03/17] Use device type mixin

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 12 +++++-------
 tests/plugins/test_deepspeed_plugin.py               |  9 +++++----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 5c5edfcf3c793..f6b3e7e392172 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -29,6 +29,7 @@
 from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
 from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.utilities.apply_func import apply_to_collection
+from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
 from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE
@@ -37,7 +38,7 @@
     import deepspeed
 
 
-class LightningDeepSpeedModule(_LightningModuleWrapperBase):
+class LightningDeepSpeedModule(_LightningModuleWrapperBase, DeviceDtypeModuleMixin):
 
     def __init__(self, pl_module: LightningModule, precision: int):
         super().__init__(pl_module)
@@ -49,12 +50,6 @@ def forward(self, *inputs, **kwargs):
 
         return super().forward(*inputs, **kwargs)
 
-    def half(self):
-        self.module.half()
-
-    def to(self, *args, **kwargs):
-        self.module.to(*args, **kwargs)
-
     @staticmethod
     def batch_to(data):
         return data.half()
@@ -63,6 +58,9 @@ def _move_float_tensors_to_half(self, batch: Any):
         batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to)
         return batch
 
+    def on_post_move_to_device(self):
+        pass
+
 
 class DeepSpeedPlugin(DDPPlugin):
     distributed_backend = "deepspeed"
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 87709c62925aa..c5bb5c4f76574 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -15,8 +15,7 @@
 from tests.helpers.boring_model import BoringModel
 
 
-@patch.object(BoringModel, 'to')
-def test_deepspeed_wrapper(mocked_to, tmpdir):
+def test_lightning_module_base_wrapper(tmpdir):
     """
     Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly.
     """
@@ -25,10 +24,12 @@ def test_deepspeed_wrapper(mocked_to, tmpdir):
     module = LightningDeepSpeedModule(model, precision=16)
 
     module.half()
+    assert module.dtype == torch.half
     assert model.dtype == torch.half
 
-    module.to('cuda')
-    assert mocked_to.called, "LightningDeepSpeedModule did not call LightningModule `to` hook when transferring device"
+    module.to(torch.double)
+    assert module.dtype == torch.double
+    assert model.dtype == torch.double
 
 
 @pytest.fixture

From 1c1c114824bf776fc9858a3b63962b234efd905a Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 11:04:25 +0000
Subject: [PATCH 04/17] Add precision

---
 tests/plugins/test_deepspeed_plugin.py | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index c5bb5c4f76574..995a53bcc1b9b 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -198,12 +198,7 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
         return loss.backward()
 
     model = TestModel()
-    trainer = Trainer(
-        fast_dev_run=True,
-        default_root_dir=tmpdir,
-        plugins=DeepSpeedPlugin(),
-        gpus=1,
-    )
+    trainer = Trainer(fast_dev_run=True, default_root_dir=tmpdir, plugins=DeepSpeedPlugin(), gpus=1, precision=16)
     with pytest.warns(UserWarning, match='Overridden backward hook in the LightningModule will be ignored'):
         trainer.fit(model)
 
@@ -228,12 +223,7 @@ def on_train_start(self) -> None:
         assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)
 
     model = TestModel()
-    trainer = Trainer(
-        plugins=DeepSpeedPlugin(),
-        default_root_dir=tmpdir,
-        gpus=1,
-        fast_dev_run=True,
-    )
+    trainer = Trainer(plugins=DeepSpeedPlugin(), default_root_dir=tmpdir, gpus=1, fast_dev_run=True, precision=16)
     trainer.fit(model)
 
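Patch 3 swaps the hand-written `half()`/`to()` overrides for `DeviceDtypeModuleMixin`, which records the device and dtype a module was last moved to so that the wrapper and the wrapped LightningModule report the same state. A rough sketch of that concept (not the actual mixin; it simply derives the state from the module's parameters after each move):

    import torch
    from torch import nn


    class DeviceDtypeTrackingSketch(nn.Module):
        """Record where the module currently lives whenever it is moved."""

        _device = torch.device('cpu')
        _dtype = torch.float32

        @property
        def device(self):
            return self._device

        @property
        def dtype(self):
            return self._dtype

        def _refresh(self):
            # derive the current state from the first parameter after any move
            param = next(self.parameters(), None)
            if param is not None:
                self._device, self._dtype = param.device, param.dtype

        def to(self, *args, **kwargs):
            module = super().to(*args, **kwargs)
            self._refresh()
            return module

        def half(self):
            module = super().half()
            self._refresh()
            return module


    class WrapperSketch(DeviceDtypeTrackingSketch):

        def __init__(self, module: nn.Module):
            super().__init__()
            self.module = module


    wrapped = WrapperSketch(nn.Linear(4, 4))
    wrapped.half()
    assert wrapped.dtype == torch.half and wrapped.module.weight.dtype == torch.half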
From 1d1a1e1c3e80218ebc4c8a2f48c5986ce055ef66 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 12:13:25 +0000
Subject: [PATCH 05/17] Turn off zero for checking optimizers are correct

---
 tests/plugins/test_deepspeed_plugin.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 995a53bcc1b9b..42bdc503cd52e 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -223,7 +223,12 @@ def on_train_start(self) -> None:
         assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)
 
     model = TestModel()
-    trainer = Trainer(plugins=DeepSpeedPlugin(), default_root_dir=tmpdir, gpus=1, fast_dev_run=True, precision=16)
+    trainer = Trainer(
+        plugins=DeepSpeedPlugin(zero_optimization=False),  # disable ZeRO so our optimizers are not wrapped
+        default_root_dir=tmpdir,
+        gpus=1,
+        fast_dev_run=True
+    )
     trainer.fit(model)
 

From d3fcc09481297b3f59a5a64138ab4d39ae351a90 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 12:25:03 +0000
Subject: [PATCH 06/17] Remove import

---
 tests/plugins/test_deepspeed_plugin.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 42bdc503cd52e..eade4e9864fae 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,6 +1,5 @@
 import json
 import os
-from unittest.mock import patch
 
 import pytest
 import torch

From f5d25fd084110c15588a1d8b1f5d39de5630ed46 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 13:24:46 +0000
Subject: [PATCH 07/17] Use FP16 Wrapper

---
 tests/plugins/test_deepspeed_plugin.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index eade4e9864fae..54517d2b55729 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -216,17 +216,21 @@ def test_deepspeed_run_configure_optimizers(tmpdir):
     class TestModel(BoringModel):
 
         def on_train_start(self) -> None:
-            assert isinstance(self.trainer.optimizers[0], torch.optim.SGD)
+            from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer
+
+            assert isinstance(self.trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
+            assert isinstance(self.trainer.optimizers[0].optimizer, torch.optim.SGD)
             assert self.trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
             # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler
             assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)
 
     model = TestModel()
     trainer = Trainer(
-        plugins=DeepSpeedPlugin(zero_optimization=False),  # disable ZeRO so our optimizers are not wrapped
+        plugins=DeepSpeedPlugin(),  # ZeRO stays enabled, so DeepSpeed wraps the optimizer
         default_root_dir=tmpdir,
         gpus=1,
-        fast_dev_run=True
+        fast_dev_run=True,
+        precision=16
     )
     trainer.fit(model)
 
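With ZeRO left enabled, the optimizer the trainer sees is DeepSpeed's `FP16_DeepSpeedZeroOptimizer`, and the original `torch.optim.SGD` sits one level down on its `.optimizer` attribute, which is what the updated assertions rely on. A small hypothetical helper (not part of Lightning or DeepSpeed) for reaching the underlying optimizer either way:

    def unwrap_optimizer(optimizer):
        # ZeRO exposes the wrapped optimizer as `.optimizer`; fall back to the
        # object itself when no wrapping took place (e.g. zero_optimization=False)
        return getattr(optimizer, 'optimizer', optimizer)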
""" diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index f6b3e7e392172..0f9a8378052a5 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -29,7 +29,6 @@ from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE @@ -38,7 +37,7 @@ import deepspeed -class LightningDeepSpeedModule(_LightningModuleWrapperBase, DeviceDtypeModuleMixin): +class LightningDeepSpeedModule(_LightningModuleWrapperBase): def __init__(self, pl_module: LightningModule, precision: int): super().__init__(pl_module) @@ -58,9 +57,6 @@ def _move_float_tensors_to_half(self, batch: Any): batch = apply_to_collection(batch, (torch.FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) return batch - def on_post_move_to_device(self): - pass - class DeepSpeedPlugin(DDPPlugin): distributed_backend = "deepspeed" From 8969d179b5bd237d299bd078b1d5d58bddee19fd Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 19 Feb 2021 14:39:37 +0000 Subject: [PATCH 09/17] Better name for the test, test precision move --- tests/plugins/test_deepspeed_plugin.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 54517d2b55729..55825d311030e 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -14,7 +14,7 @@ from tests.helpers.boring_model import BoringModel -def test_lightning_module_base_wrapper(tmpdir): +def test_deepspeed_lightning_module(tmpdir): """ Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly. """ @@ -26,6 +26,11 @@ def test_lightning_module_base_wrapper(tmpdir): assert module.dtype == torch.half assert model.dtype == torch.half + x = torch.randn((1, 32), dtype=torch.float) + out = module(x) + + assert out.dtype == torch.half + module.to(torch.double) assert module.dtype == torch.double assert model.dtype == torch.double From 7795fcbb4317b76b1d3c34cb50399dcba71fc254 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 19 Feb 2021 14:42:23 +0000 Subject: [PATCH 10/17] Added CHANGELOG.md --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ad54381a082b..98d3f9bfa8c16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
From 7795fcbb4317b76b1d3c34cb50399dcba71fc254 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 14:42:23 +0000
Subject: [PATCH 10/17] Added CHANGELOG.md

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ad54381a082b..98d3f9bfa8c16 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070))
+
+
 ## [1.2.0] - 2021-02-18
 
 ### Added

From 98c152d043aed62b3ab9ddcb675f4c8e813f12fe Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 15:36:22 +0000
Subject: [PATCH 11/17] Revert "Added CHANGELOG.md"

This reverts commit 7795fcbb
---
 CHANGELOG.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 98d3f9bfa8c16..2ad54381a082b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,9 +22,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070))
-
-
 ## [1.2.0] - 2021-02-18
 
 ### Added
+ """ + + model = BoringModel() + module = LightningDeepSpeedModule(model, precision=16) + + module.cuda().half() + assert module.dtype == torch.half + assert model.dtype == torch.half + + x = torch.randn((1, 32), dtype=torch.float).cuda() out = module(x) assert out.dtype == torch.half From 23824aa4664f5d28ee875bac176c1cf8922b768d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 19 Feb 2021 19:14:33 +0000 Subject: [PATCH 13/17] Provide ZeRO config --- tests/plugins/test_deepspeed_plugin.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index c6c365a08b753..fbb53974efd33 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -75,6 +75,11 @@ def deepspeed_config(): } +@pytest.fixture +def deepspeed_zero_config(deepspeed_config): + return {**deepspeed_config, 'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2}} + + @pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.") def test_deepspeed_plugin_string(tmpdir): """ @@ -266,7 +271,7 @@ def on_train_start(self) -> None: @pytest.mark.skipif( not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" ) -def test_deepspeed_config(tmpdir, deepspeed_config): +def test_deepspeed_config(tmpdir, deepspeed_zero_config): """ Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers and saves the model weights to load correctly. @@ -275,15 +280,18 @@ def test_deepspeed_config(tmpdir, deepspeed_config): class TestModel(BoringModel): def on_train_start(self) -> None: - import deepspeed - assert isinstance(self.trainer.optimizers[0], torch.optim.SGD) + from deepspeed.runtime.lr_schedules import WarmupLR + from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer + + assert isinstance(self.trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer) + assert isinstance(self.trainer.optimizers[0].optimizer, torch.optim.SGD) assert self.trainer.lr_schedulers == [] # DeepSpeed manages LR scheduler internally - assert isinstance(self.trainer.model.optimizer, torch.optim.SGD) - assert isinstance(self.trainer.model.lr_scheduler, deepspeed.runtime.lr_schedules.WarmupLR) + # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler + assert isinstance(self.trainer.model.lr_scheduler, WarmupLR) model = TestModel() trainer = Trainer( - plugins=[DeepSpeedPlugin(config=deepspeed_config)], + plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)], default_root_dir=tmpdir, gpus=1, fast_dev_run=True, From 7a6cd1e095025f96fe2b9b34f92695118cc67702 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 19 Feb 2021 19:57:41 +0000 Subject: [PATCH 14/17] Revert "Revert "Added CHANGELOG.md"" This reverts commit 98c152d0 --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7dad863d41293..69e80203f12da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
From 7a6cd1e095025f96fe2b9b34f92695118cc67702 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 19:57:41 +0000
Subject: [PATCH 14/17] Revert "Revert "Added CHANGELOG.md""

This reverts commit 98c152d0
---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7dad863d41293..69e80203f12da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed incorrect yield logic for the amp autocast context manager ([#6080](https://github.com/PyTorchLightning/pytorch-lightning/pull/6080))
 
+Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070))
+
+
 ## [1.2.0] - 2021-02-18
 
 ### Added

From 49ec362156c23372d049a851b7d0e9ed7df153ec Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 20:06:42 +0000
Subject: [PATCH 15/17] Support torch device as input to cuda, as is with
 upstream pytorch

---
 pytorch_lightning/utilities/device_dtype_mixin.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/utilities/device_dtype_mixin.py b/pytorch_lightning/utilities/device_dtype_mixin.py
index 6408c6e21cad4..3e3eccc93b368 100644
--- a/pytorch_lightning/utilities/device_dtype_mixin.py
+++ b/pytorch_lightning/utilities/device_dtype_mixin.py
@@ -119,7 +119,7 @@ def to(self, *args, **kwargs) -> Module:
         self.__update_properties(device=out[0], dtype=out[1])
         return super().to(*args, **kwargs)
 
-    def cuda(self, device: Optional[int] = None) -> Module:
+    def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Module:
         """Moves all model parameters and buffers to the GPU.
         This also makes associated parameters and buffers different objects. So
         it should be called before constructing optimizer if the module will
@@ -132,7 +132,8 @@ def cuda(self, device: Optional[int] = None) -> Module:
         Returns:
             Module: self
         """
-        self.__update_properties(device=torch.device('cuda', index=device))
+        property_device = device if isinstance(device, torch.device) else torch.device('cuda', index=device)
+        self.__update_properties(device=property_device)
         return super().cuda(device=device)
 
     def cpu(self) -> Module:

From 71ea9bf7f4ca5512b6b6537268e54860fed9a4a7 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Sat, 20 Feb 2021 20:10:40 +0000
Subject: [PATCH 16/17] Modify test to include all possible cuda variations

---
 tests/utilities/test_dtype_device_mixin.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tests/utilities/test_dtype_device_mixin.py b/tests/utilities/test_dtype_device_mixin.py
index 17e208022a5ac..45a85744f0415 100644
--- a/tests/utilities/test_dtype_device_mixin.py
+++ b/tests/utilities/test_dtype_device_mixin.py
@@ -101,12 +101,19 @@ def test_submodules_multi_gpu_ddp_spawn(tmpdir):
     trainer.fit(model)
 
 
+@pytest.mark.parametrize(
+    ['device'],
+    [
+        pytest.param(None),  # explicitly call without an index to see if the returning device contains an index
+        pytest.param(0),
+        pytest.param(torch.device('cuda', 0)),
+    ]
+)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-def test_gpu_device_includes_index():
+def test_gpu_cuda_device(device):
     model = TopModule()
-    # explicitly call without an index to see if the returning device contains an index (it should!)
-    model.cuda()
+
+    model.cuda(device)
 
     device = model.device
     assert device.type == 'cuda'

From 17715c7f24f1c12ffdffe9d33b471c348ed893ed Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Sat, 20 Feb 2021 23:52:20 +0000
Subject: [PATCH 17/17] Trigger Build
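The last two functional patches let `DeviceDtypeModuleMixin.cuda()` accept a `torch.device` as well as a bare index, matching upstream `torch.nn.Module.cuda`, and extend the mixin test accordingly. The normalisation added in the mixin boils down to the following sketch (`normalize_cuda_device` is an invented name for illustration):

    from typing import Optional, Union

    import torch


    def normalize_cuda_device(device: Optional[Union[torch.device, int]]) -> torch.device:
        # keep a torch.device untouched, wrap a bare index into a CUDA device
        return device if isinstance(device, torch.device) else torch.device('cuda', index=device)


    assert normalize_cuda_device(0) == torch.device('cuda', 0)
    assert normalize_cuda_device(torch.device('cuda', 1)) == torch.device('cuda', 1)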