[FIX] Average Pbar Metrics #4534

Merged 17 commits on Nov 12, 2020
3 changes: 3 additions & 0 deletions pytorch_lightning/core/step_result.py
@@ -134,6 +134,9 @@ def log(
        # sync across workers when using distributed training
        sync_fn = sync_fn or sync_ddp_if_available
        if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)):
            is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized()
            # TODO: Find a way to make the reduction only once, so we don't need to clone.
            value = value.clone() if is_dist_initialized else value
            value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op)

        if 'meta' not in self:
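
For context on what the synced logging above does, here is a minimal sketch of a mean all-reduce across processes, assuming torch.distributed is initialized. The helper name mean_all_reduce is illustrative only and is not Lightning's sync_ddp_if_available implementation.

import torch
import torch.distributed as dist


def mean_all_reduce(value: torch.Tensor) -> torch.Tensor:
    # Illustrative sketch only: average a tensor across all processes.
    # Clone first so the caller's tensor is not mutated by the in-place
    # all_reduce; this mirrors the clone added in the diff above.
    if not (dist.is_available() and dist.is_initialized()):
        return value
    value = value.clone()
    dist.all_reduce(value, op=dist.ReduceOp.SUM)
    value /= dist.get_world_size()
    return value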
@@ -14,11 +14,13 @@
"""
Tests to ensure that the training loop works with a scalar
"""
import os
import torch
import pytest

from pytorch_lightning import Trainer
from tests.base.deterministic_model import DeterministicModel
from tests.base import BoringModel


def test_training_step_scalar(tmpdir):
@@ -190,3 +192,49 @@ def test_train_step_epoch_end_scalar(tmpdir):
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'].item() == 171


class DPPReduceMeanPbarModel(BoringModel):

    logged = []

    def training_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        loss /= loss.clone().detach()
        self.log('self_log', loss, prog_bar=True, sync_dist=True)
        return {"loss": loss, "progress_bar": {"loss_2": loss}}


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_dpp_reduce_mean_pbar(tmpdir):
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DPPReduceMeanPbarModel()
    model.training_step_end = None
    model.training_epoch_end = None

    distributed_backend = "ddp_spawn"

    trainer = Trainer(
        max_epochs=1,
        default_root_dir=os.getcwd(),
        limit_train_batches=10,
        limit_test_batches=2,
        limit_val_batches=2,
        distributed_backend=distributed_backend,
        gpus=2,
        precision=32,
    )

    trainer.fit(model)

    # TODO: Move this test to DDP. pbar_added_metrics is empty with ddp_spawn for some reason.

    pbar_added_metrics = trainer.dev_debugger.pbar_added_metrics
    is_in = False
    for pbar_metrics in pbar_added_metrics:
        if 'loss_2' in pbar_metrics:
            is_in = True
            assert pbar_metrics["loss_2"].item() == 1
    if distributed_backend == "ddp":
        assert is_in is True
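
To make the assertion above concrete: each rank divides its loss by its own detached copy, so every rank logs exactly 1.0, and the mean reduction applied when sync_dist=True still yields 1.0. A minimal standalone sketch of that arithmetic, using plain tensors with no DDP and made-up per-rank values:

import torch

# Each rank's raw loss differs, but dividing by its own detached clone gives 1.0.
rank_losses = [torch.tensor(0.7), torch.tensor(1.3)]
normalized = [loss / loss.clone().detach() for loss in rank_losses]

# The cross-rank mean of identical 1.0 values is still 1.0,
# which is why the test expects pbar_metrics["loss_2"].item() == 1.
synced = torch.stack(normalized).mean()
assert synced.item() == 1.0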