From b730a5a281f3af077c7378d5952951f68aa2c34e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Mar 2021 15:58:05 +0100 Subject: [PATCH 1/3] Do not describe when there's no summary (#6681) --- pytorch_lightning/profiler/profilers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py index bc9e3541dbaa8a..78327fa0a91d8b 100644 --- a/pytorch_lightning/profiler/profilers.py +++ b/pytorch_lightning/profiler/profilers.py @@ -148,7 +148,9 @@ def describe(self) -> None: # so to avoid them, we open and close the files within this function # by calling `_prepare_streams` and `teardown` self._prepare_streams() - self._write_stream(self.summary()) + summary = self.summary() + if summary: + self._write_stream(summary) if self._output_file is not None: self._output_file.flush() self.teardown(stage=self._stage) From 21fc5eb21e6db07bcc222afa4204b3d5fb5be323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 26 Mar 2021 18:04:59 +0100 Subject: [PATCH 2/3] Automatically find and run special tests (#6669) --- azure-pipelines.yml | 2 +- benchmarks/test_sharded_parity.py | 150 ++++++--------------- tests/accelerators/__init__.py | 12 -- tests/accelerators/test_ddp.py | 15 +-- tests/accelerators/test_multi_nodes_gpu.py | 7 + tests/special_tests.sh | 81 +++++++---- tests/utilities/test_all_gather_grad.py | 6 +- 7 files changed, 109 insertions(+), 164 deletions(-) mode change 100644 => 100755 tests/special_tests.sh diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d88a31ae9775a3..c48faadc4d976c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -82,7 +82,7 @@ jobs: displayName: 'Testing: standard' - bash: | - sh tests/special_tests.sh + bash tests/special_tests.sh displayName: 'Testing: special' - bash: | diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 231556079e1ed3..28cbd7828b1087 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import time from typing import Type @@ -21,113 +20,13 @@ from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.plugins import DDPSpawnShardedPlugin -from tests.accelerators import DDPLauncher from tests.helpers.boring_model import BoringModel, RandomDataset from tests.helpers.runif import RunIf -@RunIf(min_gpus=1, skip_windows=True, fairscale=True) -def test_ddp_sharded_plugin_correctness_one_gpu(): - plugin_parity_test( - gpus=1, - model_cls=SeedTrainLoaderModel, - ) - - -@RunIf(min_gpus=1, skip_windows=True, fairscale=True, amp_native=True) -def test_ddp_sharded_plugin_correctness_amp_one_gpu(): - plugin_parity_test( - gpus=1, - precision=16, - model_cls=SeedTrainLoaderModel, - ) - - -@pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.") -@RunIf(min_gpus=2, skip_windows=True, fairscale=True) -def test_ddp_sharded_plugin_correctness_multi_gpu(): - plugin_parity_test( - gpus=2, - model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers - ) - - -@RunIf(min_gpus=2, skip_windows=True, fairscale=True, amp_native=True) -def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): - plugin_parity_test( - gpus=2, - precision=16, - model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers - ) - - -@RunIf(min_gpus=2, skip_windows=True, fairscale=True, amp_native=True) -def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): - plugin_parity_test( - gpus=2, - precision=16, - model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers - ) - - -@RunIf(min_gpus=2, fairscale=True) -@pytest.mark.skipif( - not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" -) -@DDPLauncher.run("--accelerator ddp --gpus 2 --precision 32") -def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None): - plugin_parity_test( - gpus=args.gpus, - precision=args.precision, - model_cls=SeedTrainLoaderModel, - ) - - -@RunIf(min_gpus=2, fairscale=True) -@pytest.mark.skipif( - not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" -) -@DDPLauncher.run("--accelerator ddp --gpus 2 --precision 16") -def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None): - plugin_parity_test( - gpus=args.gpus, - precision=args.precision, - model_cls=SeedTrainLoaderModel, - ) - - -@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") -@RunIf(min_gpus=2, skip_windows=True, fairscale=True) -def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): - """ - Ensures same results using multiple optimizers across multiple GPUs - """ - plugin_parity_test( - gpus=2, - model_cls=SeedTrainLoaderMultipleOptimizersModel, - max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers - ) - - -@pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") -@RunIf(min_gpus=2, skip_windows=True, fairscale=True) -def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): - """ - Ensures using multiple optimizers across multiple GPUs with manual optimization - """ - plugin_parity_test( - gpus=2, - model_cls=SeedTrainLoaderManualModel, - max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers - ) - - class SeedTrainLoaderModel(BoringModel): """ - Overrides training loader to ensure we enforce the same seed for all DDP processes. + Overrides training loader to ensure we enforce the same seed for all DDP processes. """ def train_dataloader(self): @@ -177,7 +76,7 @@ class SeedTrainLoaderMultipleOptimizersModel(SeedTrainLoaderModel): def training_step(self, batch, batch_idx, optimizer_idx): output = self.layer(batch) loss = self.loss(batch, output) - return {"loss": loss} + return {'loss': loss} def training_epoch_end(self, outputs) -> None: # outputs should be an array with an entry per optimizer @@ -279,11 +178,48 @@ def plugin_parity_test( # Assert speed parity by ensuring percentage difference between custom/ddp is below threshold percent_diff = (custom_model_time - ddp_time) / custom_model_time - assert percent_diff <= max_percent_speed_diff, \ - f'Custom DDP plugin was too slow compared to DDP, Custom Plugin Time: {custom_model_time}, DDP Time: {ddp_time}' + assert ( + percent_diff <= max_percent_speed_diff + ), f'Custom DDP plugin was too slow compared to DDP, Custom Plugin Time: {custom_model_time}, DDP Time: {ddp_time}' if use_cuda: # Assert CUDA memory parity - assert max_memory_custom <= max_memory_ddp, \ - f'Custom plugin used too much memory compared to DDP,' \ + assert max_memory_custom <= max_memory_ddp, ( + 'Custom plugin used too much memory compared to DDP, ' f'Custom Mem: {max_memory_custom}, DDP Mem: {max_memory_ddp}' + ) + + +@RunIf(skip_windows=True, fairscale=True) +@pytest.mark.parametrize( + 'kwargs', + [ + pytest.param(dict(gpus=1, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=1)), + pytest.param( + dict(gpus=1, precision=16, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=1, amp_native=True) + ), + pytest.param(dict(gpus=2, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=2)), + pytest.param( + dict(gpus=2, precision=16, model_cls=SeedTrainLoaderModel), marks=RunIf(min_gpus=2, amp_native=True) + ), + pytest.param( + dict(gpus=2, model_cls=SeedTrainLoaderMultipleOptimizersModel), + marks=[ + RunIf(min_gpus=2), + pytest.mark.skip(reason='TODO: Current issue with multiple optimizers and FairScale.'), + ], + ), + pytest.param( + dict(gpus=2, model_cls=SeedTrainLoaderManualModel), + marks=[ + RunIf(min_gpus=2), + pytest.mark.skip(reason='TODO: Current issue with multiple optimizers and FairScale.'), + ], + ), + ], +) +def test_ddp_spawn_sharded_plugin(kwargs): + if kwargs['gpus'] > 1: + # TODO: decrease speed diff since only 2 GPUs sharding 2 optimizers + kwargs['max_percent_speed_diff'] = 0.25 + plugin_parity_test(**kwargs) diff --git a/tests/accelerators/__init__.py b/tests/accelerators/__init__.py index 9583ec95374370..e69de29bb2d1d6 100644 --- a/tests/accelerators/__init__.py +++ b/tests/accelerators/__init__.py @@ -1,12 +0,0 @@ -try: - from dtrun.launcher import DDPLauncher -except ImportError: - - class DDPLauncher: - - def run(cmd_line, **kwargs): - - def inner(func): - pass - - return inner diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 541110ac8846b8..06aed2c1020fff 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -20,7 +20,7 @@ import torch from pytorch_lightning import Trainer -from tests.accelerators import ddp_model, DDPLauncher +from tests.accelerators import ddp_model from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf from tests.utilities.distributed import call_training_script @@ -71,19 +71,6 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir): assert out['test_acc'] > 0.7 -@RunIf(min_gpus=2) -@DDPLauncher.run( - "--max_epochs [max_epochs] --gpus 2 --accelerator [accelerator]", - max_epochs=["1"], - accelerator=["ddp", "ddp_spawn"] -) -def test_cli_to_pass(tmpdir, args=None): - """ - This test verify we can call function using test_cli name - """ - return '1' - - @RunIf(skip_windows=True) @pytest.mark.skipif(torch.cuda.is_available(), reason="test doesn't requires GPU machine") def test_torch_distributed_backend_env_variables(tmpdir): diff --git a/tests/accelerators/test_multi_nodes_gpu.py b/tests/accelerators/test_multi_nodes_gpu.py index 3a5ba5f0d1d717..c086150a605280 100644 --- a/tests/accelerators/test_multi_nodes_gpu.py +++ b/tests/accelerators/test_multi_nodes_gpu.py @@ -15,6 +15,7 @@ import sys from unittest import mock +import pytest import torch from tests.helpers.runif import RunIf @@ -28,6 +29,9 @@ from tests.helpers.boring_model import BoringModel # noqa: E402 +# TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml) +# use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` +@pytest.mark.skip("Multi-node testing is currently disabled") @RunIf(special=True) def test_logging_sync_dist_true_ddp(tmpdir): """ @@ -65,6 +69,9 @@ def validation_step(self, batch, batch_idx): assert trainer.logged_metrics['bar'] == fake_result +# TODO(Borda): When multi-node tests are re-enabled (.github/workflows/ci_test-mnodes.yml) +# use an environment variable `PL_RUNNING_MULTINODE_TESTS` and set `RunIf(multinode=True)` +@pytest.mark.skip("Multi-node testing is currently disabled") @RunIf(special=True) @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test__validation_step__log(tmpdir): diff --git a/tests/special_tests.sh b/tests/special_tests.sh old mode 100644 new mode 100755 index c381b5e9feeb6a..aa5d65844a1c52 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright The PyTorch Lightning team. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,32 +12,58 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# Running special tests set -e + +# this environment variable allows special tests to run export PL_RUNNING_SPECIAL_TESTS=1 -DEFAULTS="-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no" -python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp -python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp -python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_invalid_deepspeed_defaults_no_precision -python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_warn_deepspeed_override_backward -python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_run_configure_optimizers -python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_config -python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_custom_precision_params -python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_assert_config_zero_offload_disabled -python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu -python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp -python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual -python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp -python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic -python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance -python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_collection -python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_ddp -python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_dp -python ${DEFAULTS} tests/trainer/logging_/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp -python ${DEFAULTS} tests/callbacks/test_pruning.py::test_pruning_callback_ddp -python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_trainer_ddp -python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp -python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler -python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model -python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_ddp -nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx +# python arguments +defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no' + +# find tests marked as `@RunIf(special=True)` +grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') +# file paths +files=$(echo "$grep_output" | cut -f1 -d:) +files_arr=($files) +# line numbers +linenos=$(echo "$grep_output" | cut -f2 -d:) +linenos_arr=($linenos) + +# tests to skip - space separated +blocklist='test_pytorch_profiler_nested_emit_nvtx' +report='' + +for i in "${!files_arr[@]}"; do + file=${files_arr[$i]} + lineno=${linenos_arr[$i]} + + # get code from `@RunIf(special=True)` line to EOF + test_code=$(tail -n +"$lineno" "$file") + + # read line by line + while read -r line; do + # if it's a test + if [[ $line == def\ test_* ]]; then + # get the name + test_name=$(echo $line | cut -c 5- | cut -f1 -d\() + + # check blocklist + if echo $blocklist | grep --word-regexp "$test_name" > /dev/null; then + report+="Skipped\t$file:$lineno::$test_name\n" + break + fi + + # run the test + report+="Ran\t$file:$lineno::$test_name\n" + python ${defaults} "${file}::${test_name}" + break + fi + done < <(echo "$test_code") +done + +nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx + +# echo test report +printf '=%.s' {1..80} +printf "\n$report" +printf '=%.s' {1..80} +printf '\n' diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index d67c9473bbb2ea..6bad31634ce830 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -55,7 +55,6 @@ class TestModel(BoringModel): training_epoch_end_called = False def training_epoch_end(self, outputs) -> None: - self.training_epoch_end_called = True losses = torch.stack([x["loss"] for x in outputs]) gathered_loss = self.all_gather({ "losses_tensor_int": torch.rand(2, 2).int().t(), @@ -67,7 +66,7 @@ def training_epoch_end(self, outputs) -> None: "losses": losses, "losses_list": [losses, losses] }) - assert gathered_loss["losses_tensor_int"][0].dtype == torch.int64 + assert gathered_loss["losses_tensor_int"][0].dtype == torch.int32 assert gathered_loss["losses_tensor_float"][0].dtype == torch.float assert gathered_loss["losses_np_ndarray"][0].dtype == torch.int64 # torch.bool can't be all_gathered @@ -76,6 +75,7 @@ def training_epoch_end(self, outputs) -> None: assert gathered_loss["losses_int"][0].dtype == torch.int assert gathered_loss["losses_list"][0].numel() == 2 * len(losses) assert gathered_loss["losses"].numel() == 2 * len(losses) + self.training_epoch_end_called = True seed_everything(42) @@ -115,6 +115,6 @@ def training_step(self, batch, batch_idx): return loss model = TestModel() - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=2, accelerator="ddp") trainer.fit(model) assert model.training_step_called From 0e45220263f4e2045dfe7f68e3e0eaac0b2033d5 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 26 Mar 2021 18:33:11 +0000 Subject: [PATCH 3/3] [warning] Add warning when values are not being reduced (#6417) * add warning non reduced * add test * update test * update changelog * Update pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> * update Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- CHANGELOG.md | 2 + pytorch_lightning/core/step_result.py | 6 +++ .../logger_connector/epoch_result_store.py | 43 +++++++++++++++++-- .../logging_/test_train_loop_logging_1_0.py | 7 ++- 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 780a8790b9fdd0..79b253061ddb22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Trigger warning when non-metric logged value with multi processes hasn't been reduced ([#6417](https://github.com/PyTorchLightning/pytorch-lightning/pull/6417)) + - Added a way to print to terminal without breaking up the progress bar ([#5470](https://github.com/PyTorchLightning/pytorch-lightning/pull/5470)) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 3961586f4946af..bd5d7fb3b0dc97 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -633,6 +633,12 @@ def rename_keys(self, map_dict: dict): meta[dest] = meta[source] del meta[source] + def get_non_metrics_keys(self): + """ + This function is used to filter metric keys for which the value isn't a Metric + """ + return [k for k, v in self.items() if not isinstance(v, Metric)] + def choose_last(x): if isinstance(x, (torch.Tensor, list)): diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 4a57b14efd89b3..e2ce66c86ecffd 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging from collections import defaultdict -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple from weakref import proxy import torch @@ -21,6 +22,19 @@ from pytorch_lightning.core.step_result import Result from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import DistributedType, LightningEnum +from pytorch_lightning.utilities.warnings import WarningCache + +log = logging.getLogger(__name__) + + +class MetricWarningCache(WarningCache): + + def __init__(self): + super().__init__() + self.warned_metrics = [] + + +warning_cache = MetricWarningCache() class ResultStoreType(LightningEnum): @@ -52,8 +66,10 @@ class HookResultStore: Those data structures enables us to reduce properly Result object when batch loop is finished. """ - def __init__(self, fx_name: str) -> None: + def __init__(self, fx_name: str, all_gather_fn: Callable, should_warn: bool) -> None: self._fx_name = fx_name + self._all_gather_fn = all_gather_fn + self._should_warn = should_warn self._internals = {} self._internals_reduced = {} self._internal_type = None @@ -109,6 +125,20 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non func = getattr(opt_metric, func_name) metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) + if self._should_warn: + for non_metric_key in opt_metric.get_non_metrics_keys(): + if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: + metric = self._all_gather_fn(metrics_to_log[non_metric_key]) + if any(metric[0] != m for m in metric[1:]): + warning_cache.warn( + f"The value associated to the key {non_metric_key}: {metric.cpu().tolist()} " + "doesn't appear to be the same accross all processes. " + "HINT: One could either do: `self.log(..., sync_dist=True, sync_fn=torch.mean)`" + " to force mean reduction across processes which can be inaccurate or implement" + " a `torchmetrics.Metric`" + ) + warning_cache.warned_metrics.append(non_metric_key) + results.append(metrics_to_log) def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> List[Dict]: @@ -227,6 +257,12 @@ class EpochResultStore: def __init__(self, trainer: 'pl.Trainer') -> None: self.trainer = proxy(trainer) + + # Add warning only for distributed (expect rpc as main worker is running the code). + _should_warn = trainer.accelerator_connector.is_distributed + _should_warn &= not trainer.training_type_plugin.rpc_enabled + self._should_warn = _should_warn + self.reset() def __getitem__(self, key: str) -> Any: @@ -278,7 +314,8 @@ def cache_result(self) -> None: info = self.info fx_name = info["fx_name"] - self._internals.setdefault(fx_name, HookResultStore(fx_name)) + all_gather_fn = self.trainer.lightning_module.all_gather + self._internals.setdefault(fx_name, HookResultStore(fx_name, all_gather_fn, self._should_warn)) # attach capture batch_size Result.attach_batch_size(self._batch_size, hook_result) diff --git a/tests/trainer/logging_/test_train_loop_logging_1_0.py b/tests/trainer/logging_/test_train_loop_logging_1_0.py index f8672eb4ec51eb..393aaacb723281 100644 --- a/tests/trainer/logging_/test_train_loop_logging_1_0.py +++ b/tests/trainer/logging_/test_train_loop_logging_1_0.py @@ -743,6 +743,7 @@ class TestLoggingSyncDistModel(BoringModel): def training_step(self, batch, batch_idx): acc = self.step(batch[0]) self.log('foo', 1, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='SUM') + self.log('cho', acc, on_step=False, on_epoch=True) return acc def validation_step(self, batch, batch_idx): @@ -763,8 +764,12 @@ def validation_step(self, batch, batch_idx): gpus=2, profiler="pytorch" ) - trainer.fit(model) + if os.getenv("LOCAL_RANK") == '0': + with pytest.warns(UserWarning, match="The value associated to the key cho:"): + trainer.fit(model) + else: + trainer.fit(model) assert trainer.logged_metrics['foo'] == 2 assert trainer.logged_metrics['bar'] == 2