Improve manual optimization API #5771

Merged: 355 commits, merged on Feb 16, 2021.
Showing changes from 1 commit; the full list of 355 commits follows.

Commits
79803f6
Fix import issue, attempting to fix tests
Jan 12, 2021
a7c0d8f
Fix initial test
Jan 12, 2021
02df0ad
Reflect hook logic from master, should wrap model after move to device
Jan 14, 2021
d0ebcba
Optional state consolidation, since master has optimizers not wrapped
justusschock Jan 22, 2021
319c3e8
change attribute for instance test
justusschock Jan 22, 2021
a34cd15
reset optimizers
justusschock Jan 22, 2021
c95b06a
legacy
Borda Jan 22, 2021
9ff0c64
imports in accel
Borda Jan 22, 2021
67d4e47
legacy2
Borda Jan 22, 2021
577b00d
trainer imports
Borda Jan 22, 2021
aa4858b
fix import errors after rebase
awaelchli Jan 25, 2021
f81a44f
move hook to new setup location
awaelchli Jan 25, 2021
a285665
provide unwrapping logic
awaelchli Jan 25, 2021
bf78d70
fix trainer callback system
awaelchli Jan 25, 2021
34947cf
added ddp2 implementation
awaelchli Jan 25, 2021
49bec53
fix imports .legacy
Borda Jan 25, 2021
ba1c986
move plugins
Borda Jan 25, 2021
45dfbb7
restore legacy
Borda Jan 25, 2021
9b7326a
drop test.py from root
Borda Jan 25, 2021
96bc05d
add tpu accelerator and plugins
justusschock Jan 26, 2021
c5994e5
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Jan 30, 2021
9e46624
fixes
awaelchli Jan 30, 2021
22d2ae8
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Jan 30, 2021
901d392
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Jan 31, 2021
e174b8d
fix lightning optimizer merge
awaelchli Jan 31, 2021
98660de
reset bugreportmodel
awaelchli Jan 31, 2021
4d95b6c
unwrapping
awaelchli Jan 31, 2021
b69d013
step routing forward
awaelchli Jan 31, 2021
cb6676d
model access
awaelchli Jan 31, 2021
a33d27f
unwrap
awaelchli Jan 31, 2021
f7486e2
opt
awaelchli Jan 31, 2021
117f16d
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Jan 31, 2021
3792b72
integrate distrib_type
awaelchli Jan 31, 2021
ef85b81
sync changes
awaelchli Jan 31, 2021
9d9a940
sync
awaelchli Feb 1, 2021
f017a39
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Feb 1, 2021
a190a56
fixes
awaelchli Feb 1, 2021
73bb607
add forgotten generators
awaelchli Feb 1, 2021
c8c74f3
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Feb 1, 2021
ae71997
add missing logic
awaelchli Feb 1, 2021
d89847b
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Feb 1, 2021
0e686c3
update
awaelchli Feb 1, 2021
d6a43ea
import
awaelchli Feb 1, 2021
ceb8f75
missed imports
awaelchli Feb 1, 2021
fbb7c20
import fixes
awaelchli Feb 1, 2021
b610999
isort
awaelchli Feb 1, 2021
9b79924
mv f
awaelchli Feb 1, 2021
9afe54d
changelog
awaelchli Feb 1, 2021
3b63e82
Merge branch 'release/1.2-dev' into ref/update-plugins
awaelchli Feb 1, 2021
ca8cb68
format
awaelchli Feb 1, 2021
0633745
move helper to parallel plugin
awaelchli Feb 1, 2021
a622e0b
d
awaelchli Feb 1, 2021
18c682f
Merge branch 'ref/update-plugins' into accelerator-refactor-sharted-4
awaelchli Feb 1, 2021
f275803
add world size
awaelchli Feb 1, 2021
4ae008b
clean up
awaelchli Feb 1, 2021
3b3918b
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Feb 1, 2021
d4c6308
duplicate
awaelchli Feb 1, 2021
7eef4a0
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli Feb 2, 2021
9949164
activate ddp_sharded and tpu
awaelchli Feb 2, 2021
6d47357
set nvidia flags
awaelchli Feb 2, 2021
a6864ec
remove unused colab var
awaelchli Feb 2, 2021
b4b9724
use_tpu <-> on_tpu attrs
awaelchli Feb 2, 2021
81001e3
make some ddp_cpu and clusterplugin tests pass
awaelchli Feb 2, 2021
cea000d
Ref/accelerator connector (#5742)
justusschock Feb 2, 2021
933e2a1
plugins
awaelchli Feb 2, 2021
ad451d8
manual optimization
justusschock Feb 2, 2021
a30a3cf
update optimizer routing
justusschock Feb 2, 2021
a05b291
add rank to torchelastic
justusschock Feb 2, 2021
4388e73
fix memory mixed precision
awaelchli Feb 2, 2021
be9d029
setstate on trainer for pickling in ddp spawn
awaelchli Feb 2, 2021
a90a160
add predict method
awaelchli Feb 2, 2021
767bee0
add back commented accelerator code
awaelchli Feb 2, 2021
f771a7f
adapt test for sync_batch_norm to new plugin
awaelchli Feb 3, 2021
1a3b04e
fix deprecated tests
awaelchli Feb 3, 2021
a1f4938
fix ddp cpu choice when no num_processes are given
awaelchli Feb 3, 2021
38bc8b7
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
awaelchli Feb 3, 2021
ce6b6de
yapf format
awaelchli Feb 3, 2021
3b7c20b
skip a memory test that cannot pass anymore
awaelchli Feb 3, 2021
1d26c9b
update on comments
tchaton Feb 3, 2021
f538c75
fix pickle error in spawn plugin
awaelchli Feb 3, 2021
b44d82e
x
awaelchli Feb 3, 2021
3820e77
avoid
awaelchli Feb 3, 2021
08ae327
x
awaelchli Feb 3, 2021
7d0e094
avoid tons of warnings from importing deprecated modules
awaelchli Feb 3, 2021
1028011
fix cyclic import in docs build
awaelchli Feb 3, 2021
11bd0d6
add support for sharded
justusschock Feb 4, 2021
6bf0b60
update typing
justusschock Feb 4, 2021
f94082b
add sharded and sharded_spawn to distributed types
justusschock Feb 4, 2021
7939b99
make unwrap model default
justusschock Feb 4, 2021
9131ffb
refactor LightningShardedDataParallel similar to LightningDistributed…
justusschock Feb 4, 2021
ed7425c
update sharded spawn to reflect changes
justusschock Feb 4, 2021
209a164
update sharded to reflect changes
justusschock Feb 4, 2021
837a070
Merge 1.1.5 changes
awaelchli Feb 4, 2021
136b321
fix merge
awaelchli Feb 4, 2021
ffcb535
fix merge
awaelchli Feb 4, 2021
1edfa73
yapf isort
awaelchli Feb 4, 2021
a689b81
merge 1.1.6
awaelchli Feb 4, 2021
330b14c
fix merge
awaelchli Feb 4, 2021
ef258d5
yapf isort
awaelchli Feb 4, 2021
c85000d
fix indentation in test
awaelchli Feb 4, 2021
5f3a35e
copy over reinit scheduler implementation from dev1.2
awaelchli Feb 4, 2021
fa1c9b7
fix apex tracking calls with dev_debugger
awaelchli Feb 5, 2021
e330a11
reduce diff to dev1.2, clean up
awaelchli Feb 5, 2021
994ac82
fix trainer config test when gpus>0 and num_processes >0 and ddp_cpu
awaelchli Feb 5, 2021
1a78601
sort plugin tests legacy/new
awaelchli Feb 6, 2021
4b76448
fix error handling for amp on cpu
awaelchli Feb 6, 2021
bfd54ab
Merge branch 'release/1.2-dev' into patch117
awaelchli Feb 6, 2021
0574d22
fix merge
awaelchli Feb 6, 2021
6ef6637
Merge branch 'patch117' into accelerator-refactor-sharded
awaelchli Feb 6, 2021
9feda39
[Feat] Resolve manual_backward (#5837)
tchaton Feb 6, 2021
7bb9d9f
fix tests/accelerator tests on cpu
awaelchli Feb 6, 2021
13ae1ff
[BugFix] Resolve manual optimization (#5852)
tchaton Feb 6, 2021
fc3b4db
Merge formatting changes from 1.2 branch
awaelchli Feb 6, 2021
b437642
Remove copy trainer parameters to happen earlier within the loop and …
SeanNaren Feb 7, 2021
8c6aa83
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
Feb 7, 2021
beb980a
resovle a bug
Feb 7, 2021
7a0fd27
Accelerator refactor sharded rpc (#5854)
justusschock Feb 7, 2021
0d0ced5
resolve bug
Feb 7, 2021
1f3ab76
fix assert in rpc test
awaelchli Feb 7, 2021
f1b1121
resolve a test
Feb 7, 2021
cd31fa1
fix docs compilation
awaelchli Feb 8, 2021
f48793e
accelerator refactor - fix for sharded parity test (#5866)
awaelchli Feb 8, 2021
81ff6ea
Remove DDP2 as this does not apply
Feb 8, 2021
20deb46
Add missing pre optimizer hook to ensure lambda closure is called
Feb 8, 2021
be4d1a2
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
Feb 8, 2021
0ac5fc4
fix apex docstring
awaelchli Feb 8, 2021
07fdd95
[accelerator][BugFix] Resolve some test for 1 gpu (#5863)
tchaton Feb 8, 2021
384b791
yapf isort
awaelchli Feb 8, 2021
b1a84b8
resolve flake8
tchaton Feb 8, 2021
a157a29
fix apex doctests
awaelchli Feb 8, 2021
08cfc65
fix apex doctests 2
awaelchli Feb 8, 2021
7888bfd
resolve docs
tchaton Feb 8, 2021
b5b4243
update drone
tchaton Feb 8, 2021
93ceb4c
Merge branch 'accelerator-refactor-sharded' of https://github.com/PyT…
tchaton Feb 8, 2021
d001bcf
clean env
Feb 8, 2021
ad47f47
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
tchaton Feb 8, 2021
60bfb1a
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
tchaton Feb 8, 2021
0608a41
update
Feb 8, 2021
f0120b5
update
Feb 8, 2021
bf8874e
Merge branch 'accelerator-refactor-sharded' of https://github.com/PyT…
Feb 8, 2021
baf7d7f
update
tchaton Feb 8, 2021
9360aad
update
tchaton Feb 8, 2021
b814cdc
merge
justusschock Feb 9, 2021
0d3ea37
Merge branch 'accelerator-refactor-sharded' of github.com:PytorchLigh…
justusschock Feb 9, 2021
f1f90c2
Fix RPC related tests, clean out old API, update for new accelerator …
SeanNaren Feb 9, 2021
6d05881
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
justusschock Feb 10, 2021
d86fdff
Update test_remove_1-4.py
justusschock Feb 10, 2021
5fbc1cf
Expose properties for tpu cores/gpus/num_gpus
Feb 10, 2021
aa9aea0
Add root GPU property
Feb 10, 2021
c35baf1
Move properties to properties.py
Feb 10, 2021
a9c6e21
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
awaelchli Feb 10, 2021
8f3947b
move tests that were previously in drone
awaelchli Feb 10, 2021
50ecc4a
Fix root GPU property (#5908)
SeanNaren Feb 10, 2021
c7d0075
fix best model path transfer when no checkpoint callback available
awaelchli Feb 10, 2021
3f61d15
Merge remote-tracking branch 'original/accelerator-refactor-sharded' …
awaelchli Feb 10, 2021
061ea46
Fix setup hook order [wip] (#5858)
SeanNaren Feb 10, 2021
1fe1f91
rename ddp sequential -> rpc sequential for special test
awaelchli Feb 10, 2021
3683f5a
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
awaelchli Feb 10, 2021
1f01b81
revert
awaelchli Feb 10, 2021
135c236
fix stupid merge problem
awaelchli Feb 10, 2021
222653d
Use property in connector for sampler (#5913)
SeanNaren Feb 10, 2021
f4311cd
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
awaelchli Feb 11, 2021
b210dee
merge the import conflicts
awaelchli Feb 11, 2021
236009e
fix spawning of processes in slurm
awaelchli Feb 11, 2021
aace276
[wip] Fix some bugs for TPU [skip ci] (#5878)
tchaton Feb 11, 2021
68273f5
resolve some tests
Feb 11, 2021
ca77fa4
update
Feb 11, 2021
c35edfd
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
justusschock Feb 11, 2021
8cacef7
fix imports
justusschock Feb 11, 2021
f7bbe48
update
Feb 11, 2021
30d9800
Merge branch 'accelerator-refactor-sharded' of https://github.com/PyT…
Feb 11, 2021
25f7f13
resolve flake8
tchaton Feb 11, 2021
fa28c41
update azure pipeline
tchaton Feb 11, 2021
51c27e6
Merge branch 'release/1.2-dev' into accelerator-refactor-sharded
tchaton Feb 11, 2021
b888d68
skip a sharded test on cpu that requires a gpu
awaelchli Feb 11, 2021
01ca4cd
resolve tpus
Feb 11, 2021
181d143
Merge branch 'master' into accelerator-refactor-sharded
justusschock Feb 11, 2021
946a1e9
resolve bug
Feb 11, 2021
2ad1a6e
Merge branch 'accelerator-refactor-sharded' of https://github.com/PyT…
Feb 11, 2021
6e0aff0
resolve flake8
tchaton Feb 11, 2021
a931791
update
Feb 11, 2021
319d034
Merge branch 'accelerator-refactor-sharded' of https://github.com/PyT…
Feb 11, 2021
4117bec
updat utils
Feb 11, 2021
8d000f7
Merge branch 'master' into accelerator-refactor-sharded
tchaton Feb 11, 2021
0b1ba67
revert permission change on files
awaelchli Feb 11, 2021
cc385b4
suggestions from carlos
awaelchli Feb 11, 2021
e9eb318
remove unrelated formatting changes
awaelchli Feb 11, 2021
7c08400
remove incomplete comment
awaelchli Feb 11, 2021
7c3d184
Update pytorch_lightning/accelerators/__init__.py
awaelchli Feb 11, 2021
503426e
remove unrelated formatting change
awaelchli Feb 11, 2021
c0fbf7a
add types
awaelchli Feb 11, 2021
23a9a10
warn 1.7 ddp manual backward only if ddp kwarg unset
awaelchli Feb 11, 2021
a70ee4a
yapf + isort
awaelchli Feb 11, 2021
b0621c4
pep8 unused imports
awaelchli Feb 11, 2021
18bfe70
Merge branch 'master' into accelerator-refactor-sharded
awaelchli Feb 11, 2021
7b0515d
fix cyclic import in docs
awaelchli Feb 12, 2021
d966057
Apply suggestions from code review
Borda Feb 12, 2021
f636d9d
typer in accelerator.py
Borda Feb 12, 2021
5579ea7
typo
tchaton Feb 12, 2021
f5df88b
Apply suggestions from code review
Borda Feb 12, 2021
233694e
formatting
Borda Feb 12, 2021
a47644a
update on comments
tchaton Feb 12, 2021
80dacb6
update typo
tchaton Feb 12, 2021
99573eb
Update pytorch_lightning/trainer/properties.py
tchaton Feb 12, 2021
ab859d7
update
tchaton Feb 12, 2021
0a633cb
Merge branch 'accelerator-refactor-sharded' into feat/5769_manual_opt…
tchaton Feb 12, 2021
4fb36da
update on comments
tchaton Feb 12, 2021
a578ac9
Merge branch 'master' into feat/5769_manual_optimization
awaelchli Feb 13, 2021
00055ac
Merge branch 'master' into feat/5769_manual_optimization
tchaton Feb 13, 2021
a9cdc4e
resolve some comments
tchaton Feb 13, 2021
c219416
Merge branch 'feat/5769_manual_optimization' of https://github.com/Py…
tchaton Feb 13, 2021
5760e12
update on comments
tchaton Feb 13, 2021
09d1f24
resolve test
tchaton Feb 13, 2021
ca71e62
add toggle_model
tchaton Feb 13, 2021
9519a31
update
tchaton Feb 13, 2021
68f5082
update on comments
tchaton Feb 13, 2021
d831931
update doc
tchaton Feb 13, 2021
559972f
typo
tchaton Feb 13, 2021
b5a1e55
update
tchaton Feb 13, 2021
00b9b99
typo
tchaton Feb 13, 2021
c2e79f8
remove space
tchaton Feb 13, 2021
79e6e8e
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 13, 2021
9893e4c
update
tchaton Feb 13, 2021
14e5499
Merge branch 'feat/5769_manual_optimization' of https://github.com/Py…
tchaton Feb 13, 2021
d7d7ec9
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 13, 2021
26a592f
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 13, 2021
652164c
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 13, 2021
d0f5875
update on comments
tchaton Feb 13, 2021
f880878
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 13, 2021
2e2aed9
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 13, 2021
e9ca4ab
update on comments
tchaton Feb 13, 2021
f5dfab0
Merge branch 'feat/5769_manual_optimization' of https://github.com/Py…
tchaton Feb 13, 2021
2454723
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 14, 2021
32795e5
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 14, 2021
6a44f22
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
e78efc4
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
bcd0388
update
tchaton Feb 15, 2021
8084243
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
315201a
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
86b8d98
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
9e3c333
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
684098f
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
5d27b18
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
5dd1c9b
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
84ec28a
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 15, 2021
faa96e9
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 16, 2021
a4a0985
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 16, 2021
e4074aa
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 16, 2021
e70fefe
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 16, 2021
869a46d
Merge branch 'master' into feat/5769_manual_optimization
mergify[bot] Feb 16, 2021
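
For context, the commits above converge on the manual optimization API that this PR improves. Below is a minimal sketch of how a LightningModule drives that API (assuming the Lightning 1.2-era hooks referenced in these commits; the toy layer and loss are illustrative only, not code from this PR):

import torch
from pytorch_lightning import LightningModule

class ManualOptimModel(LightningModule):
    """Toy module exercising the manual optimization API."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    @property
    def automatic_optimization(self) -> bool:
        # Opt out of Lightning's automatic optimization loop.
        return False

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()         # LightningOptimizer wrapper around the configured optimizer
        opt.zero_grad()
        loss = self.layer(batch).sum()  # placeholder loss
        self.manual_backward(loss)      # used instead of loss.backward() so precision/distributed plugins still hook in
        opt.step()

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)

The remainder of this page shows the diff introduced by the single merge commit selected in this view (f017a39).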
Merge branch 'release/1.2-dev' into accelerator-refactor-sharted-4
awaelchli committed Feb 1, 2021
commit f017a397a954acaac8edc59f94c13baa2dd5e5e9
77 changes: 34 additions & 43 deletions pytorch_lightning/accelerators/accelerator_connector.py
@@ -21,16 +21,12 @@
from pytorch_lightning.accelerators.cpu import CPUAccelerator
from pytorch_lightning.accelerators.gpu import GPUAccelerator
from pytorch_lightning.accelerators.tpu import TPUAccelerator
from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment
from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.plugins import (
ApexMixedPrecisionPlugin,
DataParallelPlugin,
DDP2Plugin,
DDPPlugin,
DDPShardedPlugin,
DDPSpawnPlugin,
DDPSpawnShardedPlugin,
HorovodPlugin,
NativeMixedPrecisionPlugin,
PrecisionPlugin,
@@ -40,10 +36,13 @@
TPUHalfPrecisionPlugin,
TPUSpawnPlugin,
)
from pytorch_lightning.plugins.environments import SLURMEnvironment, TorchElasticEnvironment
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
from pytorch_lightning.utilities import (
_APEX_AVAILABLE,
_HOROVOD_AVAILABLE,
_NATIVE_AMP_AVAILABLE,
_TPU_AVAILABLE,
AMPType,
device_parser,
DeviceType,
@@ -53,39 +52,28 @@
from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn
from pytorch_lightning.utilities.exceptions import MisconfigurationException

try:
import torch_xla.core.xla_model as xm
except ImportError:
XLA_AVAILABLE = False
else:
XLA_AVAILABLE = True

try:
if _HOROVOD_AVAILABLE:
import horovod.torch as hvd
except (ModuleNotFoundError, ImportError):
_HOROVOD_AVAILABLE = False
else:
_HOROVOD_AVAILABLE = True


class BackendConnector(object):

def __init__(
self,
num_processes,
tpu_cores,
distributed_backend,
auto_select_gpus,
gpus,
num_nodes,
sync_batchnorm,
benchmark,
replace_sampler_ddp,
deterministic,
precision,
amp_type,
amp_level,
cluster_environment,
self,
num_processes,
tpu_cores,
distributed_backend,
auto_select_gpus,
gpus,
num_nodes,
sync_batchnorm,
benchmark,
replace_sampler_ddp,
deterministic,
precision,
amp_type,
amp_level,
cluster_environment,
):
# initialization
self._device_type = DeviceType.CPU
@@ -102,7 +90,7 @@ def __init__(
self.replace_sampler_ddp = replace_sampler_ddp
self.deterministic = deterministic
self.precision = precision
self.amp_type = None if amp_type is None else amp_type.lower()
self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None
self.amp_level = amp_level
self.cluster_environment = cluster_environment
self.is_slurm_managing_tasks = False
@@ -203,7 +191,9 @@ def parallel_devices(self):
if self.on_gpu:
devices = [torch.device("cuda", i) for i in self.parallel_device_ids]
elif self.on_tpu:
devices = [xm.xla_device(i) for i in self.parallel_device_ids]
# explicitly don't make a tpu device here!
# https://github.com/PyTorchLightning/pytorch-lightning/issues/3169
devices = [i for i in self.parallel_device_ids]
else:
devices = [torch.device("cpu")] * self.num_processes
return devices
@@ -266,8 +256,8 @@ def select_training_type_plugin(self):
use_ddp_cpu_spawn = self.use_ddp and self.on_cpu
use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic
use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks
use_ddp_sharded = self.distributed_backend == "ddp_sharded"
use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn"
# use_ddp_sharded = self.distributed_backend == "ddp_sharded"
# use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn"

if self.on_tpu:
ddp_plugin_cls = TPUSpawnPlugin
@@ -277,11 +267,12 @@ def select_training_type_plugin(self):
if os.environ.get("PL_IN_DDP_SUBPROCESS", False):
use_torchelastic_ddp = False

if use_ddp_sharded:
ddp_plugin_cls = DDPShardedPlugin
elif use_ddp_sharded_spawn:
ddp_plugin_cls = DDPSpawnShardedPlugin
elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp:
# fixme
# if use_ddp_sharded:
# ddp_plugin_cls = DDPShardedPlugin
# elif use_ddp_sharded_spawn:
# ddp_plugin_cls = DDPSpawnShardedPlugin
if use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp:
ddp_plugin_cls = DDPPlugin
elif use_ddp_spawn or use_ddp_cpu_spawn:
ddp_plugin_cls = DDPSpawnPlugin
@@ -388,8 +379,8 @@ def set_distributed_mode(self):

# for DDP overwrite nb processes by requested GPUs
if (
self._device_type == DeviceType.GPU
and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)
self._device_type == DeviceType.GPU
and self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)
):
self.num_processes = self.num_gpus

@@ -407,7 +398,7 @@ def set_distributed_mode(self):

rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
num_cores = self.tpu_cores if self.tpu_cores is not None else 0
rank_zero_info(f'TPU available: {XLA_AVAILABLE}, using: {num_cores} TPU cores')
rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores')

if torch.cuda.is_available() and self._device_type != DeviceType.GPU:
rank_zero_warn("GPU available but not used. Set the --gpus flag when calling the script.")
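
The amp_type line changed above now only lowercases string input; a standalone illustration of the new normalization behavior (a sketch, not the connector's actual code):

def normalize_amp_type(amp_type):
    # Lowercase only when a string was passed; anything else falls back to None.
    return amp_type.lower() if isinstance(amp_type, str) else None

assert normalize_amp_type("Native") == "native"
assert normalize_amp_type(None) is None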
5 changes: 3 additions & 2 deletions pytorch_lightning/accelerators/cpu.py
@@ -1,14 +1,15 @@
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.plugins import MixedPrecisionPlugin
from pytorch_lightning.plugins.precision import MixedPrecisionPlugin
from pytorch_lightning.utilities.exceptions import MisconfigurationException


class CPUAccelerator(Accelerator):

def setup(self, trainer, model):
if isinstance(self.precision_plugin, MixedPrecisionPlugin):
MisconfigurationException("amp + cpu is not supported. Please use a GPU option")

if "cpu" not in str(self.root_device):
raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead")

return super().setup(trainer, model)
return super().setup(trainer, model)
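
As rendered above, the precision guard in CPUAccelerator.setup constructs a MisconfigurationException without raising it; the intended check presumably reads:

if isinstance(self.precision_plugin, MixedPrecisionPlugin):
    raise MisconfigurationException("amp + cpu is not supported. Please use a GPU option")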
3 changes: 2 additions & 1 deletion pytorch_lightning/accelerators/gpu.py
@@ -5,6 +5,7 @@


class GPUAccelerator(Accelerator):

def setup(self, trainer, model):
if "cuda" not in str(self.root_device):
raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead")
@@ -23,4 +24,4 @@ def on_train_start(self):
def on_train_end(self):
# clean up memory
with torch.cuda.device(self.root_device):
torch.cuda.empty_cache()
torch.cuda.empty_cache()
8 changes: 5 additions & 3 deletions pytorch_lightning/accelerators/tpu.py
@@ -1,10 +1,12 @@
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.plugins.training_type import SingleTPUPlugin, TPUSpawnPlugin
from pytorch_lightning.plugins.precision import MixedPrecisionPlugin
from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin
from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin
from pytorch_lightning.utilities.exceptions import MisconfigurationException


class TPUAccelerator(Accelerator):

def setup(self, trainer, model):
if isinstance(self.precision_plugin, MixedPrecisionPlugin):
raise MisconfigurationException(
@@ -14,4 +16,4 @@ def setup(self, trainer, model):

if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)):
raise MisconfigurationException("TPUs only support a single tpu core or tpu spawn training.")
return super().setup(trainer, model)
return super().setup(trainer, model)
31 changes: 29 additions & 2 deletions pytorch_lightning/plugins/__init__.py
@@ -1,4 +1,31 @@
from pytorch_lightning.plugins.base_plugin import Plugin # noqa: F401
from pytorch_lightning.plugins.precision import *
from pytorch_lightning.plugins.training_type import *
from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin # noqa: F401
from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin # noqa: F401
from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401
from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401
from pytorch_lightning.plugins.precision.tpu_bfloat import TPUHalfPrecisionPlugin # noqa: F401
from pytorch_lightning.plugins.training_type.ddp import DDPPlugin # noqa: F401
from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin # noqa: F401
from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin # noqa: F401
from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401
from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401
from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401
from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401
from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401
from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401

__all__ = [
"ApexMixedPrecisionPlugin",
"DataParallelPlugin",
"DDP2Plugin",
"DDPPlugin",
"DDPSpawnPlugin",
"HorovodPlugin",
"NativeMixedPrecisionPlugin",
"PrecisionPlugin",
"ShardedNativeMixedPrecisionPlugin",
"SingleDevicePlugin",
"SingleTPUPlugin",
"TPUHalfPrecisionPlugin",
"TPUSpawnPlugin",
]
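
With the explicit re-exports above, the concrete plugin classes can be imported directly from the package root rather than from their submodules; a quick sanity check (illustrative only, using names from the __all__ list above):

from pytorch_lightning.plugins import DDPPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin

# NativeMixedPrecisionPlugin is a precision plugin; DDPPlugin is a training-type plugin.
assert issubclass(NativeMixedPrecisionPlugin, PrecisionPlugin)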
19 changes: 2 additions & 17 deletions pytorch_lightning/plugins/environments/torchelastic_environment.py
@@ -15,8 +15,8 @@
import os

from pytorch_lightning import _logger as log
from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.utilities import rank_zero_warn, rank_zero_info
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.utilities import rank_zero_warn


class TorchElasticEnvironment(ClusterEnvironment):
@@ -46,18 +46,3 @@ def world_size(self):

def local_rank(self):
return int(os.environ['LOCAL_RANK'])

def node_rank(self):
# TODO: use GROUP_RANK and provide a default environment class that uses NODE_RANK
# torchelastic uses the envvar GROUP_RANK, whereas other systems(?) use NODE_RANK.
# otherwise use given node rank or default to node rank 0
env_vars = ['NODE_RANK', 'GROUP_RANK']
node_ids = [(k, os.environ.get(k, None)) for k in env_vars]
node_ids = [(k, v) for k, v in node_ids if v is not None]
if len(node_ids) == 0:
return 0
if len(node_ids) > 1:
log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the first one.")
k, rank = node_ids.pop()
rank_zero_info(f"Using environment variable {k} for node rank ({rank}).")
return int(rank)
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/training_type/ddp.py
@@ -24,8 +24,8 @@

from pytorch_lightning import _logger as log
from pytorch_lightning.distributed import LightningDistributed
from pytorch_lightning.overrides import LightningDistributedModule
from pytorch_lightning.overrides.data_parallel import unwrap_lightning_module
from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
from pytorch_lightning.utilities import _HYDRA_AVAILABLE
from pytorch_lightning.utilities.distributed import (
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -22,8 +22,8 @@

from pytorch_lightning import _logger as log
from pytorch_lightning.distributed.dist import LightningDistributed
from pytorch_lightning.overrides import LightningDistributedModule
from pytorch_lightning.overrides.data_parallel import unwrap_lightning_module
from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin
from pytorch_lightning.utilities.cloud_io import atomic_save
from pytorch_lightning.utilities.cloud_io import load as pl_load
28 changes: 16 additions & 12 deletions pytorch_lightning/plugins/training_type/single_device.py
@@ -1,44 +1,48 @@
from typing import Any, Union

import torch
from torch._C import device

from pytorch_lightning.plugins .training_type.training_type_plugin import TrainingTypePlugin
from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin


class SingleDevicePlugin(TrainingTypePlugin):
def __init__(self, device):

def __init__(self, device: torch.device) -> bool:
super().__init__()
self.device: torch.device = device

@property
def on_tpu(self):
return self.device.type == 'xla'
def on_tpu(self) -> bool:
return False

@property
def on_gpu(self):
def on_gpu(self) -> bool:
return self.device.type == "cuda" and torch.cuda.is_available()

def reduce(self, output, *args, **kwargs):
def reduce(self, output: Union[Any, torch.Tensor], *args: Any, **kwargs: Any) -> Union[Any, torch.Tensor]:
return output

@property
def root_device(self):
def root_device(self) -> torch.device:
return self.device
def model_to_device(self):

def model_to_device(self) -> None:
if self.on_gpu:
torch.cuda.set_device(self.root_device)

self._model.to(self.root_device)

def connect(self, model: torch.nn.Module):
def connect(self, model: torch.nn.Module) -> torch.nn.Module:
self._model = model
self.model_to_device()
return self.model

@property
def is_global_zero(self):
def is_global_zero(self) -> bool:
return True

def barrier(self, *args, **kwargs):
def barrier(self, *args, **kwargs) -> None:
pass

def broadcast(self, obj: object, src: int = 0) -> object:
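
The refactored SingleDevicePlugin shown above (truncated in this condensed view) can be exercised on its own; a rough sketch, assuming the plugin API as it stands at this commit:

import torch
from pytorch_lightning.plugins import SingleDevicePlugin

plugin = SingleDevicePlugin(torch.device("cpu"))
model = torch.nn.Linear(4, 1)
plugin.connect(model)               # stores the model and moves it to the plugin's device
assert plugin.root_device.type == "cpu"
assert plugin.is_global_zero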
You are viewing a condensed version of this merge commit.