From b3c19023f2cc85cd58d11ebd341c0eda2b323694 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 8 Mar 2020 00:48:08 +0100
Subject: [PATCH 01/22] check for nan values

---
 pytorch_lightning/trainer/training_loop.py | 35 +++++++++++++++++++---
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index c6f844370b477..03c0ec6238890 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -121,21 +121,23 @@ def training_step(self, batch, batch_idx):
 """
 
 
-from typing import Callable
-
 import copy
-import warnings
 import logging as log
+import sys
+import warnings
 from abc import ABC, abstractmethod
+from typing import Callable
 from typing import Union, List
 
 import numpy as np
+import torch
+from torch import Tensor
 from torch.utils.data import DataLoader
 
+from pytorch_lightning.callbacks.base import Callback
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.utilities.debugging import MisconfigurationException
-from pytorch_lightning.callbacks.base import Callback
 
 try:
     from apex import amp
@@ -702,6 +704,9 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens):
         # format and reduce outputs accordingly
         output = self.process_output(output, train=True)
 
+        # check if loss or model weights are NaN
+        self.detect_nan(output[0])
+
         return output
 
     def update_learning_rates(self, interval):
@@ -736,3 +741,25 @@ def call_checkpoint_callback(self):
         if self.checkpoint_callback is not None:
             self.checkpoint_callback.on_validation_end(self, self.get_model())
         self.on_validation_end()
+
+    def detect_nan(self, loss: Tensor) -> None:
+        # check if loss is NaN
+        if torch.any(torch.isnan(loss)):
+            warnings.warn(
+                'The loss returned in `training_step` is NaN.'
+                ' Will stop training.',
+                UserWarning
+            )
+            sys.exit()
+        # check if a network weight is NaN (only the ones we optimize for)
+        for name, param in self.model.named_parameters():
+            if torch.any(torch.isnan(param)):
+                warnings.warn(
+                    f'Detected NaN values in `{name}`.'
+                    ' Check your forward pass for numerically unstable operations.'
+                    ' Will stop training.',
+                    UserWarning
+                )
+                sys.exit()
+
+

From a0a2417950cd7ed6edf07ace85dfc3cea8988367 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 8 Mar 2020 01:17:28 +0100
Subject: [PATCH 02/22] test nan detection on loss

---
 tests/test_cpu_models.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/test_cpu_models.py b/tests/test_cpu_models.py
index 1d0b1d1320f56..ac8d4a9bca7d1 100644
--- a/tests/test_cpu_models.py
+++ b/tests/test_cpu_models.py
@@ -1,5 +1,6 @@
 import warnings
 
+import pytest
 import torch
 
 import tests.models.utils as tutils
@@ -359,5 +360,33 @@ def test_single_gpu_model(tmpdir):
     tutils.run_model_test(trainer_options, model)
 
 
+def test_nan_detection(tmpdir):
+
+    class NanModel(LightTrainDataloader, TestModelBase):
+
+        def __init__(self, hparams):
+            super().__init__(hparams)
+
+        def training_step(self, batch, batch_idx):
+            output = super().training_step(batch, batch_idx)
+            if isinstance(output, dict):
+                output['loss'] /= 0  # make loss NaN
+            else:
+                output /= 0
+            return output
+
+    hparams = tutils.get_hparams()
+    model = NanModel(hparams)
+
+    # fit model
+    trainer = Trainer(
+        default_save_path=tmpdir,
+        max_steps=10,
+    )
+
+    with pytest.raises(SystemExit, match=r".*The loss returned in `training_step` is NaN.*"):
+        trainer.fit(model)
+
+
 # if __name__ == '__main__':
 #     pytest.main([__file__])

From b690eab4f9ccdb00901d0074bd31de93c62f6b6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 8 Mar 2020 01:18:08 +0100
Subject: [PATCH 03/22] sys.exit

---
 pytorch_lightning/trainer/training_loop.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 03c0ec6238890..50dc998292f91 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -745,21 +745,17 @@ def call_checkpoint_callback(self):
     def detect_nan(self, loss: Tensor) -> None:
         # check if loss is NaN
         if torch.any(torch.isnan(loss)):
-            warnings.warn(
+            sys.exit(
                 'The loss returned in `training_step` is NaN.'
-                ' Will stop training.',
-                UserWarning
+                ' Will stop training.'
             )
-            sys.exit()
-        # check if a network weight is NaN (only the ones we optimize for)
+        # check if a network weight is NaN
         for name, param in self.model.named_parameters():
             if torch.any(torch.isnan(param)):
-                warnings.warn(
+                sys.exit(
                     f'Detected NaN values in `{name}`.'
                     ' Check your forward pass for numerically unstable operations.'
                     ' Will stop training.',
-                    UserWarning
                 )
-                sys.exit()
 
 

From bf75bbe8e5b46d830216322f53fb0c60c11a9c75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 8 Mar 2020 01:22:20 +0100
Subject: [PATCH 04/22] whitespace

---
 pytorch_lightning/trainer/training_loop.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 50dc998292f91..20665dda28b51 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -757,5 +757,3 @@ def detect_nan(self, loss: Tensor) -> None:
                     ' Check your forward pass for numerically unstable operations.'
                     ' Will stop training.',
                 )
-
-

From aa59da09ab83f5085cad3f540e328506473dc868 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 9 Mar 2020 00:31:40 +0100
Subject: [PATCH 05/22] detect nan and inf values in loss and params

---
 pytorch_lightning/trainer/training_loop.py | 12 ++---
 tests/test_cpu_models.py                   | 57 +++++++++++++++++-----
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 20665dda28b51..0caae8d443189 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -743,17 +743,17 @@ def call_checkpoint_callback(self):
         self.on_validation_end()
 
     def detect_nan(self, loss: Tensor) -> None:
-        # check if loss is NaN
-        if torch.any(torch.isnan(loss)):
+        # check if loss is nan
+        if not torch.isfinite(loss).all():
             sys.exit(
-                'The loss returned in `training_step` is NaN.'
+                'The loss returned in `training_step` is nan or inf.'
                 ' Will stop training.'
             )
-        # check if a network weight is NaN
+        # check if a network weight is nan
         for name, param in self.model.named_parameters():
-            if torch.any(torch.isnan(param)):
+            if not torch.isfinite(param).all():
                 sys.exit(
-                    f'Detected NaN values in `{name}`.'
+                    f'Detected nan and/or inf values in `{name}`.'
                     ' Check your forward pass for numerically unstable operations.'
                     ' Will stop training.',
                 )
diff --git a/tests/test_cpu_models.py b/tests/test_cpu_models.py
index ac8d4a9bca7d1..258e055862dfb 100644
--- a/tests/test_cpu_models.py
+++ b/tests/test_cpu_models.py
@@ -1,3 +1,4 @@
+import math
 import warnings
 
 import pytest
@@ -360,32 +361,64 @@ def test_single_gpu_model(tmpdir):
     tutils.run_model_test(trainer_options, model)
 
 
-def test_nan_detection(tmpdir):
+def test_nan_loss_detection(tmpdir):
+    test_step = 8
 
-    class NanModel(LightTrainDataloader, TestModelBase):
-
-        def __init__(self, hparams):
-            super().__init__(hparams)
+    class InfLossModel(LightTrainDataloader, TestModelBase):
 
         def training_step(self, batch, batch_idx):
             output = super().training_step(batch, batch_idx)
-            if isinstance(output, dict):
-                output['loss'] /= 0  # make loss NaN
-            else:
-                output /= 0
+            if batch_idx == test_step:
+                if isinstance(output, dict):
+                    output['loss'] /= 0  # make loss infinite
+                else:
+                    output /= 0
             return output
 
     hparams = tutils.get_hparams()
-    model = NanModel(hparams)
+    model = InfLossModel(hparams)
 
     # fit model
     trainer = Trainer(
        default_save_path=tmpdir,
-        max_steps=10,
+        max_steps=(test_step + 1),
     )
 
-    with pytest.raises(SystemExit, match=r".*The loss returned in `training_step` is NaN.*"):
+    with pytest.raises(SystemExit, match=r'.*The loss returned in `training_step` is nan or inf.*'):
         trainer.fit(model)
+    assert trainer.global_step == test_step
+
+    for param in model.parameters():
+        assert torch.isfinite(param).all()
+
+
+def test_nan_params_detection(tmpdir):
+    test_step = 8
+
+    class NanParamModel(LightTrainDataloader, TestModelBase):
+
+        def training_step(self, batch, batch_idx):
+            output = super().training_step(batch, batch_idx)
+            if batch_idx == test_step:
+                # simulate parameter that became nan
+                self.c_d1.bias[0] = torch.as_tensor(math.nan)
+            return output
+
+    hparams = tutils.get_hparams()
+
+    model = NanParamModel(hparams)
+    trainer = Trainer(
+        default_save_path=tmpdir,
+        max_steps=(test_step + 1),
+    )
+
+    with pytest.raises(SystemExit, match=r'.*Detected nan and/or inf values in `c_d1.bias`.*'):
+        trainer.fit(model)
+    assert trainer.global_step == test_step
+
+    # after aborting the training loop, model still has nan-valued params
+    params = torch.cat([param.view(-1) for param in model.parameters()])
+    assert not torch.isfinite(params).all()
 
 
 # if __name__ == '__main__':
 #     pytest.main([__file__])

From 200c3b10053c3cd0c0c50088df39538cb9fe07e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 9 Mar 2020 00:49:24 +0100
Subject: [PATCH 06/22] update

---
 pytorch_lightning/trainer/training_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 0caae8d443189..6ebe5ce137d63 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -704,7 +704,7 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens):
         # format and reduce outputs accordingly
         output = self.process_output(output, train=True)
 
-        # check if loss or model weights are NaN
+        # check if loss or model weights are nan
         self.detect_nan(output[0])
 
         return output

From 7e1307aad935b16e40afc9339598c2318d6b7ca0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 9 Mar 2020 01:10:29 +0100
Subject: [PATCH 07/22] added documentation

---
 pytorch_lightning/trainer/training_loop.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 6ebe5ce137d63..71ca16d673617 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -119,6 +119,17 @@ def training_step(self, batch, batch_idx):
 
     trainer = Trainer(truncated_bptt_steps=2)
 
+NaN detection and intervention
+------------------------------
+In every forward pass in training, Lightning will check that
+
+1. the loss you return in `training_step` is finite (not NaN and not +/-inf)
+2. the model parameters have finite values.
+
+Lightning will terminate the training loop with an error message if NaN or infinite
+values are detected. If this happens, you should investigate numerically unstable operations
+in your model.
+
 """
 
 

From fd31e7ba425f403b06b2910c87407c97328a7323 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 00:23:25 +0100
Subject: [PATCH 08/22] moved detect nan to training loop, remove flag for print

---
 pytorch_lightning/trainer/training_loop.py   | 28 +++-----------------
 pytorch_lightning/trainer/training_tricks.py | 24 ++++++++++++++++-
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 71ca16d673617..d412219b5b1dc 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -201,7 +201,6 @@ class TrainerTrainLoopMixin(ABC):
     optimizers: ...
     accumulate_grad_batches: int
     use_amp: bool
-    print_nan_grads: ...
     track_grad_norm: ...
     model: LightningModule
     running_loss: ...
@@ -214,7 +213,7 @@ class TrainerTrainLoopMixin(ABC):
     reload_dataloaders_every_epoch: bool
     progress_bar_refresh_rate: ...
     max_steps: int
-    max_steps: int
+    min_steps: int
     total_batch_idx: int
     checkpoint_callback: ...
@@ -257,7 +256,7 @@ def clip_gradients(self):
         """Warning: this is just empty shell for code implemented in other class."""
 
     @abstractmethod
-    def print_nan_gradients(self):
+    def detect_nan(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
     @abstractmethod
@@ -574,9 +573,8 @@ def optimizer_closure():
                 # calculate loss
                 loss = optimizer_closure()
 
-                # nan grads
-                if self.print_nan_grads:
-                    self.print_nan_gradients()
+                # check if loss or model weights are nan
+                self.detect_nan(loss)
 
                 # track total loss for logging (avoid mem leaks)
                 self.batch_loss_value += loss.item()
@@ -715,9 +713,6 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens):
         # format and reduce outputs accordingly
         output = self.process_output(output, train=True)
 
-        # check if loss or model weights are nan
-        self.detect_nan(output[0])
-
         return output
 
     def update_learning_rates(self, interval):
@@ -753,18 +748,3 @@ def call_checkpoint_callback(self):
             self.checkpoint_callback.on_validation_end(self, self.get_model())
         self.on_validation_end()
 
-    def detect_nan(self, loss: Tensor) -> None:
-        # check if loss is nan
-        if not torch.isfinite(loss).all():
-            sys.exit(
-                'The loss returned in `training_step` is nan or inf.'
-                ' Will stop training.'
-            )
-        # check if a network weight is nan
-        for name, param in self.model.named_parameters():
-            if not torch.isfinite(param).all():
-                sys.exit(
-                    f'Detected nan and/or inf values in `{name}`.'
-                    ' Check your forward pass for numerically unstable operations.'
-                    ' Will stop training.',
-                )
diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py
index c468e1ba61c19..4d6969f56b221 100644
--- a/pytorch_lightning/trainer/training_tricks.py
+++ b/pytorch_lightning/trainer/training_tricks.py
@@ -1,8 +1,10 @@
 import logging as log
+import math
+import sys
 from abc import ABC, abstractmethod
 
 import torch
-import math
+from torch import Tensor
 
 from pytorch_lightning.callbacks import GradientAccumulationScheduler
 
@@ -15,6 +17,7 @@ class TrainerTrainingTricksMixin(ABC):
     # this is just a summary on variables used in this abstract class,
     # the proper values/initialisation should be done in child class
     gradient_clip_val: ...
+    precision: ...
 
     @abstractmethod
     def get_model(self):
@@ -51,6 +54,25 @@ def print_nan_gradients(self):
             if (param.grad is not None) and torch.isnan(param.grad.float()).any():
                 log.info(param, param.grad)
 
+    def detect_nan(self, loss: Tensor) -> None:
+        model = self.get_model()
+
+        # check if loss is nan
+        if not torch.isfinite(loss).all():
+            sys.exit(
+                'The loss returned in `training_step` is nan or inf.'
+                ' Will stop training.'
+            )
+        # check if a network weight is nan
+        for name, param in model.named_parameters():
+            if not torch.isfinite(param).all():
+                self.print_nan_gradients()
+                sys.exit(
+                    f'Detected nan and/or inf values in `{name}`.'
+                    ' Check your forward pass for numerically unstable operations.'
+                    ' Will stop training.',
+                )
+
     def configure_accumulated_gradients(self, accumulate_grad_batches):
         if isinstance(accumulate_grad_batches, dict):
             self.accumulation_scheduler = GradientAccumulationScheduler(accumulate_grad_batches)

From a54be1f6caba739778d29f331fcc990f18c18399 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 00:32:37 +0100
Subject: [PATCH 09/22] blank line

---
 pytorch_lightning/trainer/training_loop.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index d412219b5b1dc..c28e309b0e4e6 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -747,4 +747,3 @@ def call_checkpoint_callback(self):
         if self.checkpoint_callback is not None:
             self.checkpoint_callback.on_validation_end(self, self.get_model())
         self.on_validation_end()
-

From f8a7aa01dedaf29316b6e2fe5d676714fadeff17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 00:51:17 +0100
Subject: [PATCH 10/22] test

---
 tests/test_cpu_models.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/test_cpu_models.py b/tests/test_cpu_models.py
index 258e055862dfb..52380d08b3efa 100644
--- a/tests/test_cpu_models.py
+++ b/tests/test_cpu_models.py
@@ -397,12 +397,10 @@ def test_nan_params_detection(tmpdir):
 
     class NanParamModel(LightTrainDataloader, TestModelBase):
 
-        def training_step(self, batch, batch_idx):
-            output = super().training_step(batch, batch_idx)
-            if batch_idx == test_step:
+        def on_after_backward(self):
+            if self.global_step == test_step:
                 # simulate parameter that became nan
-                self.c_d1.bias[0] = torch.as_tensor(math.nan)
-            return output
+                torch.nn.init.constant_(self.c_d1.bias, math.nan)
 
     hparams = tutils.get_hparams()
 

From 3aa478a7cfef24775d57cb3e7577c1c182f49755 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 00:53:49 +0100
Subject: [PATCH 11/22] rename

---
 pytorch_lightning/trainer/training_loop.py   | 4 ++--
 pytorch_lightning/trainer/training_tricks.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index c28e309b0e4e6..b04d3fc5cfa6a 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -256,7 +256,7 @@ def clip_gradients(self):
         """Warning: this is just empty shell for code implemented in other class."""
 
     @abstractmethod
-    def detect_nan(self, *args):
+    def detect_nan_tensors(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
     @abstractmethod
@@ -574,7 +574,7 @@ def optimizer_closure():
                 loss = optimizer_closure()
 
                 # check if loss or model weights are nan
-                self.detect_nan(loss)
+                self.detect_nan_tensors(loss)
 
                 # track total loss for logging (avoid mem leaks)
                 self.batch_loss_value += loss.item()
diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py
index 4d6969f56b221..a92a206b3a8a7 100644
--- a/pytorch_lightning/trainer/training_tricks.py
+++ b/pytorch_lightning/trainer/training_tricks.py
@@ -48,13 +48,13 @@ def clip_gradients(self):
             for p in parameters:
                 p.grad.data.mul_(torch.where(clip_coef < 1, clip_coef, torch.tensor(1., device=device)))
 
-    def print_nan_gradients(self):
+    def print_nan_gradients(self) -> None:
         model = self.get_model()
         for param in model.parameters():
             if (param.grad is not None) and torch.isnan(param.grad.float()).any():
                 log.info(param, param.grad)
 
-    def detect_nan(self, loss: Tensor) -> None:
+    def detect_nan_tensors(self, loss: Tensor) -> None:
         model = self.get_model()
 
         # check if loss is nan

From c36e505c09e22d258fa83f199355a4fdf63c9361 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 01:05:24 +0100
Subject: [PATCH 12/22] deprecate print_nan_grads

---
 pytorch_lightning/trainer/trainer.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 8c5a906fc3573..f726f109760ff 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -110,7 +110,7 @@ def __init__(
             distributed_backend: Optional[str] = None,
             use_amp=False,  # backward compatible, todo: remove in v0.9.0
             precision: int = 32,
-            print_nan_grads: bool = False,
+            print_nan_grads: bool = False,  # backward compatible, todo: remove in v0.9.0
             weights_summary: str = 'full',
             weights_save_path: Optional[str] = None,
            amp_level: str = 'O1',
@@ -209,7 +209,10 @@ def __init__(
 
             precision: Full precision (32), half precision (16).
 
-            print_nan_grads: Prints gradients with nan values
+            print_nan_grads:
+                .. warning:: .. deprecated:: 0.7.0
+                    Has no effect. When detected, NaN grads will be printed automatically.
+                    Will remove 0.9.0.
 
             weights_summary: Prints a summary of the weights when training begins.
 
@@ -297,7 +300,12 @@ def __init__(
                           "`num_sanity_val_steps` since v0.5.0"
                           " and this method will be removed in v0.8.0", DeprecationWarning)
             self.nb_sanity_val_steps = nb_sanity_val_steps
-        self.print_nan_grads = print_nan_grads
+
+        # Backward compatibility, TODO: remove in v0.9.0
+        if print_nan_grads:
+            warnings.warn("Argument `print_nan_grads` has no effect and will be removed in v0.9.0."
+                          " NaN grads will be printed automatically when detected.", DeprecationWarning)
+
         self.truncated_bptt_steps = truncated_bptt_steps
         self.resume_from_checkpoint = resume_from_checkpoint
         self.shown_warnings = set()

From 08b1a09ff1dd365c7b7304140c64a8a97c06c3e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 01:07:01 +0100
Subject: [PATCH 13/22] deprecated print_nan_grads

---
 tests/test_cpu_models.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/test_cpu_models.py b/tests/test_cpu_models.py
index 52380d08b3efa..eca31f4830cc7 100644
--- a/tests/test_cpu_models.py
+++ b/tests/test_cpu_models.py
@@ -28,7 +28,6 @@ def test_early_stopping_cpu_model(tmpdir):
         gradient_clip_val=1.0,
         overfit_pct=0.20,
         track_grad_norm=2,
-        print_nan_grads=True,
         show_progress_bar=True,
         logger=tutils.get_test_tube_logger(tmpdir),
         train_percent_check=0.1,
@@ -50,7 +49,6 @@ def test_lbfgs_cpu_model(tmpdir):
     trainer_options = dict(
         default_save_path=tmpdir,
         max_epochs=2,
-        print_nan_grads=True,
         show_progress_bar=False,
         weights_summary='top',
         train_percent_check=1.0,
@@ -70,7 +68,6 @@ def test_default_logger_callbacks_cpu_model(tmpdir):
         max_epochs=1,
         gradient_clip_val=1.0,
         overfit_pct=0.20,
-        print_nan_grads=True,
         show_progress_bar=False,
         train_percent_check=0.01,
         val_percent_check=0.01,
@@ -253,7 +250,6 @@ def test_all_features_cpu_model(tmpdir):
         gradient_clip_val=1.0,
         overfit_pct=0.20,
         track_grad_norm=2,
-        print_nan_grads=True,
         show_progress_bar=False,
         logger=tutils.get_test_tube_logger(tmpdir),
         accumulate_grad_batches=2,

From 8a6b3f38249b3a2f96b832cf13db38899b6a2e25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 11:57:19 +0100
Subject: [PATCH 14/22] remove unused imports

---
 pytorch_lightning/trainer/training_loop.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 2fa5ea1dbd669..328170c11732b 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -133,16 +133,12 @@ def training_step(self, batch, batch_idx):
 """
 
 import copy
-import logging as log
-import sys
 import warnings
 from abc import ABC, abstractmethod
 from typing import Callable
 from typing import Union, List
 
 import numpy as np
-import torch
-from torch import Tensor
 from torch.utils.data import DataLoader
 
 from pytorch_lightning import _logger as log

From cb6c7b370859780427367228f6b1e3795b2b789b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 12:03:17 +0100
Subject: [PATCH 15/22] update changelog

---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 176b1826b3719..e206d5f98ccf3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added type hints to `pytorch_lightning.core` ([#946](https://github.com/PyTorchLightning/pytorch-lightning/pull/946))
 - Added support for IterableDataset in validation and testing ([#1104](https://github.com/PyTorchLightning/pytorch-lightning/pull/1104))
 - Added support for non-primitive types in hparams for TensorboardLogger ([#1130](https://github.com/PyTorchLightning/pytorch-lightning/pull/1130))
-
+- Added a check that stops the training when loss or weights contain NaN or inf values. ([#1097](https://github.com/PyTorchLightning/pytorch-lightning/pull/1097))
 
 ### Changed
 
@@ -20,7 +20,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Deprecated
 
--
+- Deprecated Trainer argument `print_nan_grads` ([#1097](https://github.com/PyTorchLightning/pytorch-lightning/pull/1097))
 
 ### Removed
 

From 701a0896ded6f7630cdbf4f2b56f7c6756f91e6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 12:10:17 +0100
Subject: [PATCH 16/22] fix line too long

---
 pytorch_lightning/trainer/trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index b264b6c973b35..89f169478f52d 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -303,7 +303,8 @@ def __init__(
         # Backward compatibility, TODO: remove in v0.9.0
         if print_nan_grads:
             warnings.warn("Argument `print_nan_grads` has no effect and will be removed in v0.9.0."
-                          " NaN grads will be printed automatically when detected.", DeprecationWarning)
+                          " NaN grads will be printed automatically when detected.",
+                          DeprecationWarning)
 
         self.truncated_bptt_steps = truncated_bptt_steps
         self.resume_from_checkpoint = resume_from_checkpoint

From 68f1007ed5c9deaf95675fe0f53524744e3dcc5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 13:29:32 +0100
Subject: [PATCH 17/22] correct deprecated version

Co-Authored-By: Jirka Borovec
---
 pytorch_lightning/trainer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 89f169478f52d..b0179ab4db21e 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -209,7 +209,7 @@ def __init__(
             precision: Full precision (32), half precision (16).
 
             print_nan_grads:
-                .. warning:: .. deprecated:: 0.7.0
+                .. warning:: .. deprecated:: 0.7.2
                     Has no effect. When detected, NaN grads will be printed automatically.
                     Will remove 0.9.0.
 

From 506803a3f055587b956ed8bbd9eeee4ca7ec0013 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 13:30:00 +0100
Subject: [PATCH 18/22] raise exception instead of sysexit

Co-Authored-By: Jirka Borovec
---
 pytorch_lightning/trainer/training_tricks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py
index c1db5a25b3156..c7598d95c8fbc 100644
--- a/pytorch_lightning/trainer/training_tricks.py
+++ b/pytorch_lightning/trainer/training_tricks.py
@@ -67,7 +67,7 @@ def detect_nan_tensors(self, loss: Tensor) -> None:
         for name, param in model.named_parameters():
             if not torch.isfinite(param).all():
                 self.print_nan_gradients()
-                sys.exit(
+                raise ValueError(
                     f'Detected nan and/or inf values in `{name}`.'
                     ' Check your forward pass for numerically unstable operations.'
                     ' Will stop training.',

From 0412daab17891aa5a612c9db85eed531715fbecc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 13:30:08 +0100
Subject: [PATCH 19/22] raise exception instead of sysexit

Co-Authored-By: Jirka Borovec
---
 pytorch_lightning/trainer/training_tricks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py
index c7598d95c8fbc..51e73481d9321 100644
--- a/pytorch_lightning/trainer/training_tricks.py
+++ b/pytorch_lightning/trainer/training_tricks.py
@@ -59,7 +59,7 @@ def detect_nan_tensors(self, loss: Tensor) -> None:
 
         # check if loss is nan
         if not torch.isfinite(loss).all():
-            sys.exit(
+            raise ValueError(
                 'The loss returned in `training_step` is nan or inf.'
                 ' Will stop training.'
             )

From 893d7dd571f03aac58cce48a34a899bb53012bc6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 13:30:20 +0100
Subject: [PATCH 20/22] Update pytorch_lightning/trainer/training_tricks.py

Co-Authored-By: Jirka Borovec
---
 pytorch_lightning/trainer/training_tricks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py
index 51e73481d9321..2fe025b5e9823 100644
--- a/pytorch_lightning/trainer/training_tricks.py
+++ b/pytorch_lightning/trainer/training_tricks.py
@@ -70,7 +70,6 @@ def detect_nan_tensors(self, loss: Tensor) -> None:
                 raise ValueError(
                     f'Detected nan and/or inf values in `{name}`.'
                     ' Check your forward pass for numerically unstable operations.'
-                    ' Will stop training.',
                 )
 

From 0d9654fd68e356dd5eb64140c22f9088c38b336c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 13:30:26 +0100
Subject: [PATCH 21/22] Update pytorch_lightning/trainer/training_tricks.py

Co-Authored-By: Jirka Borovec
---
 pytorch_lightning/trainer/training_tricks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py
index 2fe025b5e9823..9dd43e193a2be 100644
--- a/pytorch_lightning/trainer/training_tricks.py
+++ b/pytorch_lightning/trainer/training_tricks.py
@@ -61,7 +61,6 @@ def detect_nan_tensors(self, loss: Tensor) -> None:
         if not torch.isfinite(loss).all():
             raise ValueError(
                 'The loss returned in `training_step` is nan or inf.'
-                ' Will stop training.'
             )
         # check if a network weight is nan
         for name, param in model.named_parameters():

From 6b21d11772d25e3e2d7e1ec7b080b6b267f3f490 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 18 Mar 2020 13:37:17 +0100
Subject: [PATCH 22/22] fix test

---
 tests/test_cpu_models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_cpu_models.py b/tests/test_cpu_models.py
index eca31f4830cc7..38fc790430fd7 100644
--- a/tests/test_cpu_models.py
+++ b/tests/test_cpu_models.py
@@ -366,7 +366,7 @@ def training_step(self, batch, batch_idx):
             output = super().training_step(batch, batch_idx)
             if batch_idx == test_step:
                 if isinstance(output, dict):
-                    output['loss'] /= 0  # make loss infinite
+                    output['loss'] *= torch.tensor(math.inf)  # make loss infinite
                 else:
                     output /= 0
             return output
@@ -380,7 +380,7 @@ def training_step(self, batch, batch_idx):
         max_steps=(test_step + 1),
     )
 
-    with pytest.raises(SystemExit, match=r'.*The loss returned in `training_step` is nan or inf.*'):
+    with pytest.raises(ValueError, match=r'.*The loss returned in `training_step` is nan or inf.*'):
         trainer.fit(model)
     assert trainer.global_step == test_step
 
@@ -406,7 +406,7 @@ def on_after_backward(self):
         max_steps=(test_step + 1),
     )
 
-    with pytest.raises(SystemExit, match=r'.*Detected nan and/or inf values in `c_d1.bias`.*'):
+    with pytest.raises(ValueError, match=r'.*Detected nan and/or inf values in `c_d1.bias`.*'):
         trainer.fit(model)
     assert trainer.global_step == test_step