Adds tests to make sure logging doesn't happen multiple times #3899

Merged · 8 commits · Oct 6, 2020
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/training_loop.py
@@ -110,6 +110,7 @@ def setup_training(self, model: LightningModule):
         if self.trainer.data_parallel:
             ref_model = model.module
 
+        # set the ranks and devices
         self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank
         self.trainer.accelerator_backend.dist.device = ref_model.device
 
@@ -125,7 +126,7 @@ def setup_training(self, model: LightningModule):
 
         # log hyper-parameters
         if self.trainer.logger is not None:
-            # save exp to get started
+            # save exp to get started (this is where the first experiment logs are written)
            self.trainer.logger.log_hyperparams(ref_model.hparams)
            self.trainer.logger.log_graph(ref_model)
            self.trainer.logger.save()
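
For context: the comment change above marks setup_training as the first place experiment data reaches the logger, which is exactly where a duplicate call per DDP process would surface. The usual way such calls stay single-shot is the rank_zero_only decorator. Below is a minimal sketch, not part of this diff: write_log_line is a hypothetical helper, and the pytorch_lightning.utilities import path is assumed from the 1.0-era codebase.

# Sketch only (not from this PR): the rank-zero-only pattern the new tests pin down.
from pytorch_lightning.utilities import rank_zero_only


@rank_zero_only
def write_log_line(message: str) -> None:
    # Every DDP process reaches this call site, but the decorator makes the body
    # a no-op unless the process has global rank 0, so the line is written exactly
    # once instead of once per process.
    print(message)
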
59 changes: 59 additions & 0 deletions tests/trainer/logging/test_distributed_logging.py
@@ -0,0 +1,59 @@
import pytest
import torch
from tests.base import BoringModel
import platform
from distutils.version import LooseVersion
from pytorch_lightning import Trainer, Callback
from unittest import mock


class TestModel(BoringModel):

    def on_pretrain_routine_end(self) -> None:
        # log one metrics dict and count how often the underlying logger is hit;
        # only the global-rank-zero process should ever reach the actual logger
        with mock.patch('pytorch_lightning.loggers.base.LightningLoggerBase.agg_and_log_metrics') as m:
            self.trainer.logger_connector.log_metrics({'a': 2}, {})
            logged_times = m.call_count
            expected = 1 if self.global_rank == 0 else 0
            msg = f'logger called {logged_times} time(s) on global rank {self.global_rank}, expected {expected}'
            assert logged_times == expected, msg


@pytest.mark.skipif(platform.system() == "Windows",
                    reason="Distributed training is not supported on Windows")
@pytest.mark.skipif((platform.system() == "Darwin" and
                     LooseVersion(torch.__version__) < LooseVersion("1.3.0")),
                    reason="Distributed training is not supported on MacOS before Torch 1.3.0")
def test_global_zero_only_logging_ddp_cpu(tmpdir):
    """
    Makes sure logging only happens from the global-rank-zero process
    """
    model = TestModel()
    model.training_epoch_end = None
    trainer = Trainer(
        distributed_backend='ddp_cpu',
        num_processes=2,
        default_root_dir=tmpdir,
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model)


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_global_zero_only_logging_ddp_spawn(tmpdir):
    """
    Makes sure logging only happens from the global-rank-zero process
    """
    model = TestModel()
    model.training_epoch_end = None
    trainer = Trainer(
        distributed_backend='ddp_spawn',
        gpus=2,
        default_root_dir=tmpdir,
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model)
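
Both tests drive a tiny two-process run and lean on TestModel.on_pretrain_routine_end for the actual assertion. The same rank-zero invariant can also be checked from user code once fit returns; here is a minimal sketch under the assumption that Trainer.is_global_zero is available as a property and that the repo's BoringModel test helper is importable.

# Sketch only (not from this PR): mirrors from the user side what the tests assert internally.
from pytorch_lightning import Trainer
from tests.base import BoringModel

trainer = Trainer(distributed_backend='ddp_cpu', num_processes=2, max_epochs=1,
                  limit_train_batches=1, limit_val_batches=1)
trainer.fit(BoringModel())

if trainer.is_global_zero:
    # Runs once, on global rank 0 only, just as the logger calls should.
    print('finished training; logs were written by rank 0 only')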