Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

resurface lost ddp info message #8111

Merged
merged 10 commits into from
Jun 27, 2021
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Pass the `stage` argument of `Callback.{setup,teardown}` as a keyword ([#7973](https://github.com/PyTorchLightning/pytorch-lightning/pull/7973))


- Fixed a DDP info message that was never shown ([#8111](https://github.com/PyTorchLightning/pytorch-lightning/pull/8111))


## [1.3.7] - 2021-06-22

Expand Down
15 changes: 7 additions & 8 deletions pytorch_lightning/plugins/training_type/ddp.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
rank_zero_deprecation,
rank_zero_warn,
)
from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only, ReduceOp, sync_ddp_if_available
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.seed import reset_seed

Expand Down Expand Up @@ -233,13 +233,6 @@ def setup_distributed(self):
# where to store ip_table
self.init_ddp_connection()

# on world_size=0 let everyone know training is starting
if self.is_global_zero and not torch.distributed.is_initialized():
log.info("-" * 100)
log.info(f"distributed_backend={self.distributed_backend}")
log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
log.info("-" * 100)

# set the ranks and devices
self.dist.rank = self.global_rank
self.dist.device = self.root_device
Expand Down Expand Up @@ -308,6 +301,12 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)

# on rank=0 let everyone know training is starting
rank_zero_info("-" * 100)
rank_zero_info(f"distributed_backend={self.distributed_backend}")
rank_zero_info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
rank_zero_info("-" * 100)

def pre_dispatch(self):
# move the model to the correct device
self.model_to_device()
Expand Down
15 changes: 7 additions & 8 deletions pytorch_lightning/plugins/training_type/ddp_spawn.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
)
from pytorch_lightning.utilities.cloud_io import atomic_save
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, sync_ddp_if_available
from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only, ReduceOp, sync_ddp_if_available
from pytorch_lightning.utilities.seed import reset_seed

if _TORCH_GREATER_EQUAL_1_8:
Expand Down Expand Up @@ -182,13 +182,6 @@ def new_process(self, process_idx, trainer, mp_queue):
# ... need to double check that it is the correct place
# self.trainer.call_setup_hook(self.model)

# on world_size=0 let everyone know training is starting
if self.is_global_zero and not torch.distributed.is_initialized():
log.info("-" * 100)
log.info(f"distributed_backend={self.distributed_backend}")
log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
log.info("-" * 100)

# set the ranks and devices
self.dist.rank = self.global_rank
self.dist.device = self.root_device
Expand Down Expand Up @@ -267,6 +260,12 @@ def init_ddp_connection(self, global_rank: Optional[int], world_size: Optional[i
log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)

# on rank=0 let everyone know training is starting
rank_zero_info("-" * 100)
rank_zero_info(f"distributed_backend={self.distributed_backend}")
rank_zero_info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
rank_zero_info("-" * 100)

def determine_ddp_device_ids(self):
if self.root_device.type == "cpu":
return None
Expand Down