diff --git a/CHANGELOG.md b/CHANGELOG.md
index 40a7cf54676b5..980d2a450f786 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -312,6 +312,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Pass the `stage` argument of `Callback.{setup,teardown}` as a keyword ([#7973](https://github.com/PyTorchLightning/pytorch-lightning/pull/7973))
 
+- Fixed a DDP info message that was never shown ([#8111](https://github.com/PyTorchLightning/pytorch-lightning/pull/8111))
+
 
 
 ## [1.3.7] - 2021-06-22
diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 84850b8d01b12..b855d100b1f12 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -37,7 +37,7 @@
     rank_zero_deprecation,
     rank_zero_warn,
 )
-from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
+from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import reset_seed
 
@@ -233,13 +233,6 @@ def setup_distributed(self):
         # where to store ip_table
         self.init_ddp_connection()
 
-        # on world_size=0 let everyone know training is starting
-        if self.is_global_zero and not torch.distributed.is_initialized():
-            log.info("-" * 100)
-            log.info(f"distributed_backend={self.distributed_backend}")
-            log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
-            log.info("-" * 100)
-
         # set the ranks and devices
         self.dist.rank = self.global_rank
         self.dist.device = self.root_device
@@ -308,6 +301,14 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
         log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
+        # on rank=0 let everyone know training is starting
+        rank_zero_info(
+            f"{'-' * 100}\n"
+            f"distributed_backend={self.torch_distributed_backend}\n"
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
+            f"{'-' * 100}\n"
+        )
+
     def pre_dispatch(self):
         # move the model to the correct device
         self.model_to_device()
diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py
index b71fc10609cdc..b61f9a6052630 100644
--- a/pytorch_lightning/plugins/training_type/ddp_spawn.py
+++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -36,7 +36,7 @@
 )
 from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning.utilities.cloud_io import load as pl_load
-from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
+from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only, ReduceOp, sync_ddp_if_available
 from pytorch_lightning.utilities.seed import reset_seed
 
 if _TORCH_GREATER_EQUAL_1_8:
@@ -182,13 +182,6 @@ def new_process(self, process_idx, trainer, mp_queue):
         # ... need to double check that it is the correct place
         # self.trainer.call_setup_hook(self.model)
 
-        # on world_size=0 let everyone know training is starting
-        if self.is_global_zero and not torch.distributed.is_initialized():
-            log.info("-" * 100)
-            log.info(f"distributed_backend={self.distributed_backend}")
-            log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
-            log.info("-" * 100)
-
         # set the ranks and devices
         self.dist.rank = self.global_rank
         self.dist.device = self.root_device
@@ -267,6 +260,14 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
         log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
         torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
+        # on rank=0 let everyone know training is starting
+        rank_zero_info(
+            f"{'-' * 100}\n"
+            f"distributed_backend={self.torch_distributed_backend}\n"
+            f"All DDP processes registered. Starting ddp with {self.world_size} processes\n"
+            f"{'-' * 100}\n"
+        )
+
     def determine_ddp_device_ids(self):
         if self.root_device.type == "cpu":
             return None
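Context for the change: the old banner was guarded by `self.is_global_zero and not torch.distributed.is_initialized()`, but it ran right after `self.init_ddp_connection()` had already initialized the process group, so the condition was always false and the message never appeared. Moving the banner into `init_ddp_connection()` behind `rank_zero_info` prints it exactly once, from rank 0. Below is a minimal, self-contained sketch of that rank-zero-only logging pattern; it assumes the rank is read from the `RANK` environment variable (as torchrun sets it) and is not PyTorch Lightning's actual implementation of `rank_zero_info`.

import logging
import os
from functools import wraps

log = logging.getLogger(__name__)


def rank_zero_only(fn):
    """Run the wrapped function only on the process with global rank 0."""

    @wraps(fn)
    def wrapped(*args, **kwargs):
        # Assumption: rank comes from the RANK env var; a real plugin would
        # read it from its own cluster environment / distributed state.
        if int(os.environ.get("RANK", "0")) == 0:
            return fn(*args, **kwargs)
        return None

    return wrapped


@rank_zero_only
def rank_zero_info(message: str) -> None:
    # Emitting the banner through a rank-zero-only helper means it is logged
    # exactly once, instead of once per DDP process (or never, as in the old
    # dead branch guarded by `not torch.distributed.is_initialized()`).
    log.info(message)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    rank_zero_info(
        f"{'-' * 100}\n"
        f"All DDP processes registered. Starting ddp with {world_size} processes\n"
        f"{'-' * 100}"
    )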