Adds tests to make sure logging doesn't happen multiple times #3899

Merged · 8 commits · Oct 6, 2020
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/training_loop.py
@@ -110,6 +110,7 @@ def setup_training(self, model: LightningModule):
         if self.trainer.data_parallel:
             ref_model = model.module
 
+        # set the ranks and devices
         self.trainer.accelerator_backend.dist.rank = self.trainer.global_rank
         self.trainer.accelerator_backend.dist.device = ref_model.device
 
@@ -125,7 +126,7 @@ def setup_training(self, model: LightningModule):
 
         # log hyper-parameters
         if self.trainer.logger is not None:
-            # save exp to get started
+            # save exp to get started (this is where the first experiment logs are written)
            self.trainer.logger.log_hyperparams(ref_model.hparams)
            self.trainer.logger.log_graph(ref_model)
            self.trainer.logger.save()
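
For context: the comment change above marks setup_training as the first place experiment data reaches the logger, which is exactly where a duplicate call per DDP process would surface. The usual way such calls stay single-shot is the rank_zero_only decorator. Below is a minimal sketch, not part of this diff: write_log_line is a hypothetical helper, and the pytorch_lightning.utilities import path is assumed from the 1.0-era codebase.

# Sketch only (not from this PR): the rank-zero-only pattern the new tests pin down.
from pytorch_lightning.utilities import rank_zero_only


@rank_zero_only
def write_log_line(message: str) -> None:
    # Every DDP process reaches this call site, but the decorator makes the body
    # a no-op unless the process has global rank 0, so the line is written exactly
    # once instead of once per process.
    print(message)
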
59 changes: 59 additions & 0 deletions tests/trainer/logging/test_distributed_logging.py
@@ -0,0 +1,59 @@
import pytest
import torch
from tests.base import BoringModel
import platform
from distutils.version import LooseVersion
from pytorch_lightning import Trainer, Callback
from unittest import mock


class TestModel(BoringModel):

    def on_pretrain_routine_end(self) -> None:
        # log one metrics dict and count how often the underlying logger is hit;
        # only the global-rank-zero process should ever reach the actual logger
        with mock.patch('pytorch_lightning.loggers.base.LightningLoggerBase.agg_and_log_metrics') as m:
            self.trainer.logger_connector.log_metrics({'a': 2}, {})
            logged_times = m.call_count
            expected = 1 if self.global_rank == 0 else 0
            msg = f'logger called {logged_times} time(s) on global rank {self.global_rank}, expected {expected}'
            assert logged_times == expected, msg


@pytest.mark.skipif(platform.system() == "Windows",
                    reason="Distributed training is not supported on Windows")
@pytest.mark.skipif((platform.system() == "Darwin" and
                     LooseVersion(torch.__version__) < LooseVersion("1.3.0")),
                    reason="Distributed training is not supported on MacOS before Torch 1.3.0")
def test_global_zero_only_logging_ddp_cpu(tmpdir):
    """
    Makes sure logging only happens from the global-rank-zero process
    """
    model = TestModel()
    model.training_epoch_end = None
    trainer = Trainer(
        distributed_backend='ddp_cpu',
        num_processes=2,
        default_root_dir=tmpdir,
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model)


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_global_zero_only_logging_ddp_spawn(tmpdir):
    """
    Makes sure logging only happens from the global-rank-zero process
    """
    model = TestModel()
    model.training_epoch_end = None
    trainer = Trainer(
        distributed_backend='ddp_spawn',
        gpus=2,
        default_root_dir=tmpdir,
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=1,
        weights_summary=None,
    )
    trainer.fit(model)
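
Both tests drive a tiny two-process run and lean on TestModel.on_pretrain_routine_end for the actual assertion. The same rank-zero invariant can also be checked from user code once fit returns; here is a minimal sketch under the assumption that Trainer.is_global_zero is available as a property and that the repo's BoringModel test helper is importable.

# Sketch only (not from this PR): mirrors from the user side what the tests assert internally.
from pytorch_lightning import Trainer
from tests.base import BoringModel

trainer = Trainer(distributed_backend='ddp_cpu', num_processes=2, max_epochs=1,
                  limit_train_batches=1, limit_val_batches=1)
trainer.fit(BoringModel())

if trainer.is_global_zero:
    # Runs once, on global rank 0 only, just as the logger calls should.
    print('finished training; logs were written by rank 0 only')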