Commit
Adds tests to make sure logging doesn't happen multiple times (#3899)
* Makes sure logging doesn't ever happen from non-root zero

* Makes sure logging doesn't ever happen from non-root zero

* Makes sure logging doesn't ever happen from non-root zero

* added bug report model

* fix local model

* fix local model

* fix local model

* fix local model
williamFalcon authored Oct 6, 2020
1 parent e4a56fa commit 2cf17a3
Showing 2 changed files with 61 additions and 1 deletion.
pytorch_lightning/trainer/training_loop.py (3 changes: 2 additions, 1 deletion)
@@ -110,6 +110,7 @@ def setup_training(self, model: LightningModule):
        if self.trainer.data_parallel:
            ref_model = model.module

        # set the ranks and devices
        self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank
        self.trainer.accelerator_backend.dist.device = ref_model.device

@@ -125,7 +126,7 @@ def setup_training(self, model: LightningModule):

        # log hyper-parameters
        if self.trainer.logger is not None:
            # save exp to get started
            # save exp to get started (this is where the first experiment logs are written)
            self.trainer.logger.log_hyperparams(ref_model.hparams)
            self.trainer.logger.log_graph(ref_model)
            self.trainer.logger.save()
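
For context, the guard that the new tests exercise comes down to letting logger calls run only on global rank zero. Below is a minimal, self-contained sketch of that pattern; it is not Lightning's actual implementation, and the RANK environment variable lookup and function names are assumptions made for illustration.

import functools
import os


def rank_zero_only(fn):
    """Run the wrapped function only on global rank zero (illustrative sketch)."""
    @functools.wraps(fn)
    def wrapped(*args, **kwargs):
        # Distributed launchers commonly export RANK; fall back to 0 for
        # single-process runs (an assumption for this sketch).
        if int(os.environ.get("RANK", "0")) == 0:
            return fn(*args, **kwargs)
        return None
    return wrapped


@rank_zero_only
def log_hyperparams(hparams):
    # Stand-in for a call like trainer.logger.log_hyperparams(...).
    print("hparams logged once:", hparams)
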
tests/trainer/logging/test_distributed_logging.py (59 changes: 59 additions, 0 deletions)
@@ -0,0 +1,59 @@
import pytest
import torch
from tests.base import BoringModel
import platform
from distutils.version import LooseVersion
from pytorch_lightning import Trainer, Callback
from unittest import mock


class TestModel(BoringModel):

    def on_pretrain_routine_end(self) -> None:
        with mock.patch('pytorch_lightning.loggers.base.LightningLoggerBase.agg_and_log_metrics') as m:
            self.trainer.logger_connector.log_metrics({'a': 2}, {})
            logged_times = m.call_count
            expected = 1 if self.global_rank == 0 else 0
            assert logged_times == expected, 'actual logger called from non-global zero'


@pytest.mark.skipif(platform.system() == "Windows",
                    reason="Distributed training is not supported on Windows")
@pytest.mark.skipif((platform.system() == "Darwin" and
                     LooseVersion(torch.__version__) < LooseVersion("1.3.0")),
                    reason="Distributed training is not supported on MacOS before Torch 1.3.0")
def test_global_zero_only_logging_ddp_cpu(tmpdir):
"""
Makes sure logging only happens from root zero
"""
model = TestModel()
model.training_epoch_end = None
trainer = Trainer(
distributed_backend='ddp_cpu',
num_processes=2,
default_root_dir=tmpdir,
limit_train_batches=1,
limit_val_batches=1,
max_epochs=1,
weights_summary=None,
)
trainer.fit(model)


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_global_zero_only_logging_ddp_spawn(tmpdir):
"""
Makes sure logging only happens from root zero
"""
model = TestModel()
model.training_epoch_end = None
trainer = Trainer(
distributed_backend='ddp_spawn',
gpus=2,
default_root_dir=tmpdir,
limit_train_batches=1,
limit_val_batches=1,
max_epochs=1,
weights_summary=None,
)
trainer.fit(model)
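
The two tests above hinge on unittest.mock call counting: the patched agg_and_log_metrics must be hit exactly once on global rank zero and never on any other rank. A self-contained illustration of that check outside Lightning follows; the FakeLogger class and log_once helper are made up for this example.

from unittest import mock


class FakeLogger:
    """Stand-in for an experiment logger (invented for this illustration)."""

    def agg_and_log_metrics(self, metrics, step=None):
        pass


def log_once(logger, global_rank):
    # Mirror the guard the tests exercise: only global rank zero logs.
    if global_rank == 0:
        logger.agg_and_log_metrics({'a': 2}, step=0)


logger = FakeLogger()

with mock.patch.object(logger, 'agg_and_log_metrics') as m:
    log_once(logger, global_rank=0)
assert m.call_count == 1  # logged exactly once on rank zero

with mock.patch.object(logger, 'agg_and_log_metrics') as m:
    log_once(logger, global_rank=1)
assert m.call_count == 0  # never logged from a non-zero rank

The ddp_cpu variant does not need GPUs and can usually be run locally with: python -m pytest tests/trainer/logging/test_distributed_logging.py -k ddp_cpu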
