Set num_nodes and sync_batchnorm From Trainer for Manually Passed Training Type Plugin #7026

Merged
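Summary: with this change, a training type plugin passed manually through `Trainer(plugins=...)` inherits `num_nodes` and `sync_batchnorm` from the `Trainer` arguments instead of requiring them on the plugin constructor. A minimal usage sketch of what this enables (the specific plugin arguments are illustrative only, not required by the change):

```python
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DDPPlugin

# num_nodes and sync_batchnorm are configured on the Trainer only;
# the manually passed DDPPlugin picks them up when the plugin is resolved.
trainer = Trainer(
    gpus=2,
    num_nodes=2,
    sync_batchnorm=True,
    plugins=[DDPPlugin(find_unused_parameters=False)],
)
```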
Commits (75)
89f284d
Fix some test errors
Mar 23, 2021
80cfbff
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 23, 2021
536c132
checkpoint consolidation
Mar 24, 2021
f172101
Update ddp_spawn.py
shuyingsunshine21 Mar 24, 2021
bf70e43
Update test_metric_result_integration.py
shuyingsunshine21 Mar 24, 2021
ea74906
Update test_results.py
shuyingsunshine21 Mar 24, 2021
a9aae99
Update utils.py
shuyingsunshine21 Mar 24, 2021
70fe5da
Update utils.py
shuyingsunshine21 Mar 24, 2021
0d23d75
Update test_all_gather_grad.py
shuyingsunshine21 Mar 24, 2021
ca6f98b
Update test_all_gather_grad.py
shuyingsunshine21 Mar 24, 2021
c5053da
Merge pull request #1 from shuyingsunshine21/shuyingsunshine21-checkp…
shuyingsunshine21 Mar 24, 2021
9d4a2b8
Update test_results.py
shuyingsunshine21 Mar 24, 2021
7635b4f
Revert "Update test_results.py"
shuyingsunshine21 Mar 24, 2021
d64f90c
Revert "Merge pull request #1 from shuyingsunshine21/shuyingsunshine2…
shuyingsunshine21 Mar 24, 2021
dcdcd29
Revert "Update test_all_gather_grad.py"
shuyingsunshine21 Mar 24, 2021
8651d54
Revert "Update utils.py"
shuyingsunshine21 Mar 24, 2021
15f4b9e
Revert "Update utils.py"
shuyingsunshine21 Mar 24, 2021
250d0aa
Revert "Update test_results.py"
shuyingsunshine21 Mar 24, 2021
6c095b2
Revert "Update test_metric_result_integration.py"
shuyingsunshine21 Mar 24, 2021
8222dc9
Revert "Update ddp_spawn.py"
shuyingsunshine21 Mar 24, 2021
3a9fde9
Revert "checkpoint consolidation"
shuyingsunshine21 Mar 24, 2021
7a369f4
Revert "Revert "checkpoint consolidation""
shuyingsunshine21 Mar 24, 2021
b4a0b9e
Revert "Revert "Revert "checkpoint consolidation"""
shuyingsunshine21 Mar 24, 2021
5cf1db1
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 24, 2021
0ce7e05
Revert "Revert "Update ddp_spawn.py""
shuyingsunshine21 Mar 24, 2021
fe9736d
Revert "Revert "Update test_metric_result_integration.py""
shuyingsunshine21 Mar 24, 2021
c314ef6
Revert "Revert "Update test_results.py""
shuyingsunshine21 Mar 24, 2021
c3feda0
Revert "Revert "Update utils.py""
shuyingsunshine21 Mar 24, 2021
c759477
Revert "Revert "Update test_all_gather_grad.py""
shuyingsunshine21 Mar 24, 2021
7a8e540
Merge branch 'master' of https://github.com/shuyingsunshine21/pytorch…
Mar 24, 2021
ab8b849
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 24, 2021
4e67db2
modify distributed environment to make test pass
Mar 24, 2021
67b6188
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 25, 2021
f9afa07
rebase to upstream master
Apr 8, 2021
f337156
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 14, 2021
fffecb8
rfc
Apr 15, 2021
a74e712
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 15, 2021
089e566
rebase
Apr 15, 2021
bb8ed77
formatting
Apr 15, 2021
6b7fe6f
more nits
Apr 15, 2021
90fa8e0
nit
Apr 15, 2021
ba4f9c4
split, setting num_nodes and sync batchnorm only
Apr 15, 2021
1eed6c9
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 15, 2021
7c88c70
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 15, 2021
bdb66ab
fix test
Apr 15, 2021
552f445
add changlog
Apr 15, 2021
1655f1e
retrigger checkes
Apr 16, 2021
76853ef
Merge branch 'master' into training_type_plugin_consolidate
tchaton Apr 19, 2021
ad77ad4
comments
Apr 20, 2021
de24614
rebase
Apr 20, 2021
c9ded5b
rebase
Apr 20, 2021
77ef90a
change accelerator_connector training_type_plugin to resolve only once
Apr 20, 2021
36427ca
nits
Apr 20, 2021
eae6dc7
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 21, 2021
824fb25
make num_nodes and sync_batchnorm as optional argument for plugin and…
Apr 21, 2021
66fab62
format
Apr 21, 2021
63e4a4e
change warn to deprecation
Apr 21, 2021
2b8c772
fix
Apr 21, 2021
4feded8
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 22, 2021
6aa1cf1
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 23, 2021
76016de
minor
Apr 23, 2021
0996a5d
remove unnecessary assert
Apr 23, 2021
afa3bbd
Merge branch 'master' into training_type_plugin_consolidate
kaushikb11 Apr 26, 2021
c3b63a2
rebase
May 4, 2021
16858be
comments
May 4, 2021
e8a110b
pull rebase
May 4, 2021
60580be
remove extra in change.md
May 4, 2021
20d59a4
correct in change.md
May 4, 2021
0ab7147
fix test and flake8
May 4, 2021
9381117
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
May 4, 2021
a35fdc3
Merge branch 'master' into training_type_plugin_consolidate
carmocca May 4, 2021
9fdde94
pre-commit
carmocca May 4, 2021
6680b0d
Merge branch 'master' into training_type_plugin_consolidate
awaelchli May 8, 2021
621bfc8
whitespace standardization
awaelchli May 8, 2021
29f720b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 8, 2021
Files changed (changes from 53 commits)
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -141,6 +141,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Changed warnings and recommendations for dataloaders in `ddp_spawn` ([#6762](https://github.com/PyTorchLightning/pytorch-lightning/pull/6762/))


- Changed `resolve_training_type_plugin` to allow setting `num_nodes` and `sync_batchnorm` from the `Trainer` settings ([#7026](https://github.com/PyTorchLightning/pytorch-lightning/pull/7026))


- `pl.seed_everything` will now also set the seed on the `DistributedSampler` ([#7024](https://github.com/PyTorchLightning/pytorch-lightning/pull/7024))


24 changes: 18 additions & 6 deletions pytorch_lightning/plugins/training_type/ddp.py
@@ -72,13 +72,13 @@ def __init__(
) -> None:
super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment)
self.interactive_ddp_procs = []
self.num_nodes = num_nodes
self._num_nodes = num_nodes
self.sync_batchnorm = sync_batchnorm
self.dist = LightningDistributed()
self.num_processes = len(self.parallel_devices) if self.parallel_devices is not None else 0
self._ddp_kwargs = kwargs
self._has_spawned_children = False
self.task_idx = None
self.num_processes = len(parallel_devices) if parallel_devices is not None else parallel_devices
self._ddp_comm_state = ddp_comm_state
self._ddp_comm_hook = ddp_comm_hook
self._ddp_comm_wrapper = ddp_comm_wrapper
@@ -88,6 +88,17 @@ def __init__(
def root_device(self):
return self.parallel_devices[self.local_rank]

@property
def num_nodes(self) -> int:
return self._num_nodes

@num_nodes.setter
def num_nodes(self, num_nodes: int) -> None:
# note that world ranks is related to num_nodes, when resetting these parameters,
# need to reset world ranks
self._num_nodes = num_nodes
self.set_world_ranks()

@property
def distributed_sampler_kwargs(self):
distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)
@@ -215,10 +226,11 @@ def _check_can_spawn_children(self):
)

def set_world_ranks(self) -> None:
if self.cluster_environment is not None:
self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank)
self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
rank_zero_only.rank = self.cluster_environment.global_rank()
if self.cluster_environment is None:
return
self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank)
self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
rank_zero_only.rank = self.cluster_environment.global_rank()

def pre_configure_ddp(self):
# if unset, default `find_unused_parameters` `True`
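For context on the `num_nodes` property introduced above: the world size and global rank are derived from `num_nodes`, so reassigning it after construction has to recompute them, which is why the setter calls `set_world_ranks()`. A self-contained sketch of that pattern (a toy class, not the actual plugin code):

```python
class WorldSizeTracker:
    """Toy illustration of the num_nodes setter pattern used by the DDP plugins."""

    def __init__(self, num_nodes: int = 1, num_processes: int = 1) -> None:
        self.num_processes = num_processes
        self.world_size = 0
        # assign through the property so derived state is computed at init time too
        self.num_nodes = num_nodes

    @property
    def num_nodes(self) -> int:
        return self._num_nodes

    @num_nodes.setter
    def num_nodes(self, num_nodes: int) -> None:
        # world size/ranks depend on num_nodes, so recompute them on every reset
        self._num_nodes = num_nodes
        self.set_world_ranks()

    def set_world_ranks(self) -> None:
        self.world_size = self.num_nodes * self.num_processes


tracker = WorldSizeTracker(num_nodes=1, num_processes=4)
assert tracker.world_size == 4
tracker.num_nodes = 2  # what the accelerator connector does when resolving the plugin
assert tracker.world_size == 8
```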
4 changes: 3 additions & 1 deletion pytorch_lightning/plugins/training_type/ddp2.py
@@ -72,6 +72,8 @@ def distributed_sampler_kwargs(self):
def _is_single_process_single_device(self) -> bool:
return False

def set_world_ranks(self):
def set_world_ranks(self) -> None:
if self.cluster_environment is None:
return
self.cluster_environment.set_global_rank(self.node_rank)
self.cluster_environment.set_world_size(self.num_nodes)
22 changes: 17 additions & 5 deletions pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -60,7 +60,7 @@ def __init__(
**kwargs: Any,
):
super().__init__(parallel_devices=parallel_devices, cluster_environment=cluster_environment)
self.num_nodes = num_nodes
self._num_nodes = num_nodes
self.sync_batchnorm = sync_batchnorm
self._ddp_kwargs = kwargs
self.dist = LightningDistributed()
@@ -72,6 +72,17 @@ def __init__(
self._local_rank = 0
self.set_world_ranks()

@property
def num_nodes(self) -> int:
return self._num_nodes

@num_nodes.setter
def num_nodes(self, num_nodes: int) -> None:
# note that world ranks is related to num_nodes, when resetting these parameters,
# need to reset world ranks
self._num_nodes = num_nodes
self.set_world_ranks()

@property
def local_rank(self) -> int:
return self._local_rank
@@ -106,10 +117,11 @@ def setup(self, model):

def set_world_ranks(self, process_idx: int = 0) -> None:
self._local_rank = process_idx
if self.cluster_environment is not None:
self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank)
self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
rank_zero_only.rank = self.cluster_environment.global_rank()
if self.cluster_environment is None:
return
self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank)
self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
rank_zero_only.rank = self.cluster_environment.global_rank()

@property
def mp_spawn_kwargs(self):
2 changes: 1 addition & 1 deletion pytorch_lightning/plugins/training_type/rpc.py
@@ -42,7 +42,7 @@ def __init__(
self,
rpc_timeout_sec: float = DEFAULT_RPC_TIMEOUT_SEC,
parallel_devices: Optional[List[torch.device]] = None,
num_nodes: Optional[int] = None,
num_nodes: int = 1,
cluster_environment: Optional[ClusterEnvironment] = None,
sync_batchnorm: Optional[bool] = None,
**kwargs
16 changes: 10 additions & 6 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -123,6 +123,7 @@ def __init__(

self.handle_given_plugins(plugins)

self._training_type_plugin_resolved = False
self.accelerator = self.select_accelerator()

# override dist backend when using tpus
@@ -221,10 +222,14 @@ def precision_plugin(self) -> PrecisionPlugin:

@property
def training_type_plugin(self) -> TrainingTypePlugin:
if self._training_type_plugin_resolved:
# avoid calling `resolve_training_type_plugin` multiple times
return self._training_type_plugin
if self._training_type_plugin is None:
self._training_type_plugin = self.select_training_type_plugin()
else:
self._training_type_plugin = self.resolve_training_type_plugin(self._training_type_plugin)
self._training_type_plugin_resolved = True

return self._training_type_plugin

@@ -315,7 +320,6 @@ def is_using_torchelastic(self) -> bool:
"""
.. deprecated:: v1.3
Will be removed in v1.5.0.

Returns:
``True`` if the current process was launched using the torchelastic command.
"""
@@ -438,20 +442,20 @@ def select_training_type_plugin(self) -> TrainingTypePlugin:

def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> TrainingTypePlugin:
# necessary for when the user has passed in a plugin
if hasattr(training_type, 'parallel_devices') and not getattr(training_type, 'parallel_devices'):
if hasattr(training_type, 'parallel_devices') and getattr(training_type, 'parallel_devices') is None:
training_type.parallel_devices = self.parallel_devices
if hasattr(training_type, 'num_processes'):
training_type.num_processes = len(self.parallel_devices)

if hasattr(training_type, 'cluster_environment') and getattr(training_type, 'cluster_environment') is None:
training_type.cluster_environment = self.select_cluster_environment()

if hasattr(training_type, 'num_nodes') and getattr(training_type, 'num_nodes') is None:
if hasattr(training_type, 'num_nodes'):
# set num_nodes for training_type from trainer setting
training_type.num_nodes = self.num_nodes

# Automatically set sync_batchnorm if None.
# Useful for custom plugins.
if hasattr(training_type, 'sync_batchnorm') and getattr(training_type, 'sync_batchnorm') is None:
if hasattr(training_type, 'sync_batchnorm'):
# set sync_batchnorm for training_type from trainer setting
training_type.sync_batchnorm = self.sync_batchnorm

return training_type
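Two behaviours from the connector changes above, reduced to a sketch: the manually passed plugin is resolved only once (guarded by a flag, so its state is not overwritten on repeated access), and `num_nodes`/`sync_batchnorm` are now forwarded unconditionally to any plugin that exposes those attributes. The function and attribute names below are simplified stand-ins, not the real connector API:

```python
def resolve_plugin_once(connector, plugin):
    """Simplified sketch of resolve-once caching plus Trainer-to-plugin forwarding."""
    if getattr(connector, "_resolved", False):
        # already resolved: return the cached plugin instead of resolving it again
        return connector._plugin

    # forward Trainer-level settings onto any plugin that exposes them;
    # assigning num_nodes triggers the plugin's setter, which recomputes world ranks
    if hasattr(plugin, "num_nodes"):
        plugin.num_nodes = connector.num_nodes
    if hasattr(plugin, "sync_batchnorm"):
        plugin.sync_batchnorm = connector.sync_batchnorm

    connector._plugin = plugin
    connector._resolved = True
    return plugin
```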
16 changes: 10 additions & 6 deletions tests/plugins/test_cluster_integration.py
@@ -47,13 +47,14 @@ def environment_combinations():


@pytest.mark.parametrize(
"plugin_cls", [
"plugin_cls",
[
DDPPlugin,
DDPShardedPlugin,
DDP2Plugin,
pytest.param(DeepSpeedPlugin, marks=RunIf(deepspeed=True)),
pytest.param(RPCSequentialPlugin, marks=RunIf(fairscale_pipe=True)),
]
],
)
def test_ranks_available_manual_plugin_selection(plugin_cls):
""" Test that the rank information is readily available after Trainer initialization. """
@@ -66,10 +67,12 @@ def test_ranks_available_manual_plugin_selection(plugin_cls):
with mock.patch.dict(os.environ, variables):
plugin = plugin_cls(
parallel_devices=[torch.device("cuda", 1), torch.device("cuda", 2)],
num_nodes=num_nodes,
cluster_environment=cluster,
)
trainer = Trainer(plugins=[plugin])
trainer = Trainer(
plugins=[plugin],
num_nodes=num_nodes,
)
assert rank_zero_only.rank == expected["global_rank"]
assert trainer.global_rank == expected["global_rank"]
assert trainer.local_rank == expected["local_rank"]
@@ -78,13 +81,14 @@


@pytest.mark.parametrize(
"trainer_kwargs", [
"trainer_kwargs",
[
dict(accelerator="ddp", gpus=[1, 2]),
dict(accelerator="ddp_sharded", gpus=[1, 2]),
dict(accelerator="ddp2", gpus=[1, 2]),
dict(accelerator="ddp_cpu", num_processes=2),
dict(accelerator="ddp_spawn", gpus=[1, 2]),
]
],
)
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.device_count", return_value=4)
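The updated test now passes `num_nodes` through the `Trainer` rather than the plugin constructor and checks the resulting ranks. A minimal standalone variant of the same idea, asserting directly that the Trainer settings reach a manually passed plugin (illustrative only; it assumes constructing a `Trainer` with a bare `DDPPlugin` is valid in a CPU-only environment for this version):

```python
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DDPPlugin


def test_trainer_settings_reach_manual_plugin():
    # the plugin is created without num_nodes/sync_batchnorm; both are set on the Trainer
    trainer = Trainer(num_nodes=2, sync_batchnorm=True, plugins=[DDPPlugin()])
    assert trainer.training_type_plugin.num_nodes == 2
    assert trainer.training_type_plugin.sync_batchnorm is True
```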