From bc8c5ee36fed6ebd46810421bbae94a1ae636901 Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Mon, 27 Apr 2020 10:07:39 +0200
Subject: [PATCH 01/12] Graceful shutdown on python interpreter exit

---
 pytorch_lightning/trainer/training_loop.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 55c63679ae9f6..3929f673090f6 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -144,6 +144,7 @@ def training_step(self, batch, batch_idx):
 from abc import ABC, abstractmethod
 from typing import Callable
 from typing import Union, List
+import atexit
 
 import numpy as np
 from torch.utils.data import DataLoader
@@ -663,7 +664,10 @@ def _get_optimizers_iterable(self):
         opt_idx = np.argmax(optimizer_freq_cumsum > current_place_in_loop)
         return [(opt_idx, self.optimizers[opt_idx])]
 
+    @atexit.register
     def run_training_teardown(self):
+        if hasattr('_teardown_already_run') and self._teardown_already_run:
+            return
         # Train end events
         with self.profiler.profile('on_train_end'):
             # callbacks
@@ -677,6 +681,8 @@ def run_training_teardown(self):
 
             # summarize profile results
             self.profiler.describe()
+
+        self._teardown_already_run = True
 
     def training_forward(self, batch, batch_idx, opt_idx, hiddens):
         """
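
A note on the mechanism patch 01 relies on: `atexit.register` runs a callable at normal interpreter exit and returns it unchanged, which is what allows it to be used as a decorator. Applied to an instance method, however, the decorator fires at class-creation time and registers the plain function, so no bound instance is available when the interpreter exits. A minimal plain-Python sketch (illustration only, with hypothetical names; not Lightning code):

    import atexit

    def farewell():
        # runs once at normal interpreter exit, in reverse registration order
        print('running teardown at interpreter exit ...')

    atexit.register(farewell)

    class Demo:
        # the decorator registers the *unbound* function at class-creation
        # time; at exit it is called with no arguments, so Python raises
        # TypeError: teardown() missing 1 required positional argument: 'self'
        @atexit.register
        def teardown(self):
            print('never reached without a bound instance')

This is one reason the teardown has to be tolerant of how it is invoked, which the later patches in this series keep adjusting.
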
From ebfb51e17df0dc2ef8aa84762d7c43498e5cb331 Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Mon, 27 Apr 2020 10:11:20 +0200
Subject: [PATCH 02/12] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6280a1beeab6..22d802dd5dc76 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed unintended Trainer argument `progress_bar_callback`, the callback should be passed in by `Trainer(callbacks=[...])` instead ([#1855](https://github.com/PyTorchLightning/pytorch-lightning/pull/1855))
 
 ### Fixed
+- Run graceful training teardown on interpreter exit ([#1631](https://github.com/PyTorchLightning/pytorch-lightning/pull/1631))
 
 - Fixed user warning when apex was used together with learning rate schedulers ([#1873](https://github.com/PyTorchLightning/pytorch-lightning/pull/1873))

From d6e6db6935d94584c507d2c097d5850e5202aec9 Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Mon, 27 Apr 2020 10:12:58 +0200
Subject: [PATCH 03/12] Update training_loop.py

---
 pytorch_lightning/trainer/training_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 3929f673090f6..66755a13714ac 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -681,7 +681,7 @@ def run_training_teardown(self):
 
             # summarize profile results
             self.profiler.describe()
-    
+
         self._teardown_already_run = True
 
     def training_forward(self, batch, batch_idx, opt_idx, hiddens):

From adc6c7c957a91c2db4647b6392b76c074f59dee8 Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Mon, 27 Apr 2020 10:50:42 +0200
Subject: [PATCH 04/12] Update training_loop.py

---
 pytorch_lightning/trainer/training_loop.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 66755a13714ac..bdea0ccc88c00 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -145,6 +145,7 @@ def training_step(self, batch, batch_idx):
 from typing import Callable
 from typing import Union, List
 import atexit
+import signal
 
 import numpy as np
 from torch.utils.data import DataLoader
@@ -372,6 +373,10 @@ def train(self):
 
             self.run_training_teardown()
 
+            # reset signal handlers
+            for sig_name in sig_names:
+                signal.signal(getattr(signal, sig_name), orig_signal_handlers[sig_name])
+
         except KeyboardInterrupt:
             if self.proc_rank == 0:
                 log.info('Detected KeyboardInterrupt, attempting graceful shutdown...')
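
As committed, the hunk above refers to `sig_names` and `orig_signal_handlers`, which do not exist yet at this point in the series; patch 06 introduces the registration loop and the `SIGNAL_TERMINATE` constant that define them. The underlying save/restore idiom works because `signal.signal` returns the handler it displaces. A minimal, self-contained sketch (hypothetical handler name; not Lightning code):

    import signal

    def _graceful_handler(signum, frame):
        print(f'caught signal {signum}, shutting down gracefully ...')

    # install handlers, remembering whatever was installed before
    orig_signal_handlers = {}
    for sig_name in ('SIGTERM', 'SIGINT'):
        orig_signal_handlers[sig_name] = signal.signal(
            getattr(signal, sig_name), _graceful_handler)

    # ... work that should be torn down gracefully would run here ...

    # reset signal handlers to the saved originals
    for sig_name in ('SIGTERM', 'SIGINT'):
        signal.signal(getattr(signal, sig_name), orig_signal_handlers[sig_name])
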
From 5584370d421d022bf884e7cb69ac553b153bba5d Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Mon, 27 Apr 2020 11:34:19 +0200
Subject: [PATCH 05/12] Update CHANGELOG.md

Co-Authored-By: Jirka Borovec
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 22d802dd5dc76..31b829c61c7a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,6 +35,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed unintended Trainer argument `progress_bar_callback`, the callback should be passed in by `Trainer(callbacks=[...])` instead ([#1855](https://github.com/PyTorchLightning/pytorch-lightning/pull/1855))
 
 ### Fixed
+
 - Run graceful training teardown on interpreter exit ([#1631](https://github.com/PyTorchLightning/pytorch-lightning/pull/1631))
 
 - Fixed user warning when apex was used together with learning rate schedulers ([#1873](https://github.com/PyTorchLightning/pytorch-lightning/pull/1873))

From e799f16f33a54380203de33d639eed29954efa91 Mon Sep 17 00:00:00 2001
From: Justus Schock
Date: Mon, 27 Apr 2020 11:39:14 +0200
Subject: [PATCH 06/12] pep8, move to constant

---
 pytorch_lightning/trainer/training_loop.py | 24 ++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index bdea0ccc88c00..eb7f0f78293b9 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -141,23 +141,23 @@ def training_step(self, batch, batch_idx):
 
 """
+import atexit
+import signal
 from abc import ABC, abstractmethod
 from typing import Callable
 from typing import Union, List
-import atexit
-import signal
 
 import numpy as np
-from torch.utils.data import DataLoader
 import torch
+from torch.utils.data import DataLoader
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.callbacks.base import Callback
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.loggers import LightningLoggerBase
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
 from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 try:
     from apex import amp
@@ -181,9 +181,11 @@ def training_step(self, batch, batch_idx):
 else:
     HOROVOD_AVAILABLE = True
 
+# constant which signals should be caught for graceful trainer shutdown
+SIGNAL_TERMINATE = ('SIGTERM', 'SIGKILL', 'SIGSEGV', 'SIGINT')
 
-class TrainerTrainLoopMixin(ABC):
 
+class TrainerTrainLoopMixin(ABC):
     # this is just a summary on variables used in this abstract class,
     # the proper values/initialisation should be done in child class
     max_epochs: int
@@ -302,6 +304,12 @@ def has_arg(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
     def train(self):
+        # add signal handlers for process kills
+        orig_signal_handlers = {}
+        for sig_name in SIGNAL_TERMINATE:
+            orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
+                                                           self.run_training_teardown)
+
         # get model
         model = self.get_model()
@@ -374,8 +382,8 @@ def train(self):
             self.run_training_teardown()
 
             # reset signal handlers
-            for sig_name in sig_names:
-                signal.signal(getattr(signal, sig_name), orig_signal_handlers[sig_name])
+            for sig_name in SIGNAL_TERMINATE:
+                signal.signal(getattr(signal, sig_name), orig_signal_handlers[sig_name])
 
         except KeyboardInterrupt:
             if self.proc_rank == 0:
@@ -411,7 +419,7 @@ def run_training_epoch(self):
 
         # run epoch
         for batch_idx, (batch, is_last_batch) in self.profiler.profile_iterable(
-            enumerate(_with_is_last(train_dataloader)), "get_train_batch"
+                enumerate(_with_is_last(train_dataloader)), "get_train_batch"
         ):
             # stop epoch if we limited the number of training batches
             if batch_idx >= self.num_training_batches:
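
One caveat on the `SIGNAL_TERMINATE` tuple added above: `SIGKILL` can never be caught, blocked, or ignored from user space, and CPython refuses to install a handler for it; patch 10 below drops it from the tuple. A `SIGSEGV` handler can be installed, but by the time it fires the process state may already be corrupted. A quick demonstration, assuming a POSIX platform (hypothetical `_noop` handler; not Lightning code):

    import signal

    def _noop(signum, frame):
        pass

    # catchable: the OS delivers these to our handler
    signal.signal(signal.SIGTERM, _noop)
    signal.signal(signal.SIGINT, _noop)

    # not catchable: the kernel reserves SIGKILL, so CPython raises OSError
    try:
        signal.signal(signal.SIGKILL, _noop)
    except OSError as err:
        print('cannot install a SIGKILL handler:', err)
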
From c075610e3e3b4f9531936d795e8b3b7cf468d5c8 Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Mon, 27 Apr 2020 12:33:03 +0200
Subject: [PATCH 07/12] Update training_loop.py

---
 pytorch_lightning/trainer/training_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index eb7f0f78293b9..d40e10b911e35 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -678,7 +678,7 @@ def _get_optimizers_iterable(self):
         return [(opt_idx, self.optimizers[opt_idx])]
 
     @atexit.register
-    def run_training_teardown(self):
+    def run_training_teardown(self, signum=None, frame=None):
         if hasattr('_teardown_already_run') and self._teardown_already_run:
             return
         # Train end events

From 388bd123eeba1ff1ee79391172c8aea7214e8b5e Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Mon, 27 Apr 2020 15:49:01 +0200
Subject: [PATCH 08/12] Update training_loop.py

---
 pytorch_lightning/trainer/training_loop.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index d40e10b911e35..54466c5bdbf60 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -679,7 +679,7 @@ def _get_optimizers_iterable(self):
 
     @atexit.register
     def run_training_teardown(self, signum=None, frame=None):
-        if hasattr('_teardown_already_run') and self._teardown_already_run:
+        if hasattr(self, '_teardown_already_run') and self._teardown_already_run:
             return
         # Train end events
         with self.profiler.profile('on_train_end'):

From 3d3f5f321b1f9bcb7bf91a45cf7d8d39d6c83ab8 Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Mon, 27 Apr 2020 16:45:29 +0200
Subject: [PATCH 09/12] Update training_loop.py

---
 pytorch_lightning/trainer/training_loop.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 54466c5bdbf60..7b49ac20b9826 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -146,6 +146,7 @@ def training_step(self, batch, batch_idx):
 from abc import ABC, abstractmethod
 from typing import Callable
 from typing import Union, List
+from functools import partial
 
 import numpy as np
 import torch
@@ -308,7 +309,8 @@ def train(self):
         orig_signal_handlers = {}
         for sig_name in SIGNAL_TERMINATE:
             orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
-                                                           self.run_training_teardown)
+                                                           partial(_signal_kill_handler,
+                                                                   self=self)
 
         # get model
         model = self.get_model()
@@ -678,7 +680,7 @@ def _get_optimizers_iterable(self):
         return [(opt_idx, self.optimizers[opt_idx])]
 
     @atexit.register
-    def run_training_teardown(self, signum=None, frame=None):
+    def run_training_teardown(self):
         if hasattr(self, '_teardown_already_run') and self._teardown_already_run:
             return
         # Train end events
@@ -826,3 +828,8 @@ def _with_is_last(iterable):
         last = val
     # yield last, no longer has next
     yield last, True
+
+
+def _signal_kill_handler(signum, frame, *args, **kwargs):
+    return TrainerTrainLoopMixin.run_training_teardown(*args, **kwargs)
+
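
Patch 09 binds the trainer instance into the handler with `functools.partial`; note the hunk as committed is missing a closing parenthesis after `self=self`, which patch 10 below resolves by switching to a closure defined inside `train()`. Either way, the resulting callable must accept the `(signum, frame)` pair the `signal` module passes in. A self-contained sketch of both variants, using a reduced stand-in `Trainer` class (hypothetical names; not Lightning code):

    import signal
    from functools import partial

    class Trainer:
        def run_training_teardown(self):
            print('graceful teardown')

    trainer = Trainer()

    # variant 1 (patch 09): bind the instance through functools.partial; the
    # signal machinery still calls the result as handler(signum, frame)
    def _signal_kill_handler(signum, frame, self=None):
        return Trainer.run_training_teardown(self)

    signal.signal(signal.SIGTERM, partial(_signal_kill_handler, self=trainer))

    # variant 2 (patch 10): a closure over the instance needs no extra plumbing
    def _closure_handler(*args):
        return trainer.run_training_teardown()

    signal.signal(signal.SIGINT, _closure_handler)
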
From 705d9037e47fb6a6191dde1df2b0341ae368342d Mon Sep 17 00:00:00 2001
From: Justus Schock
Date: Mon, 27 Apr 2020 17:21:32 +0200
Subject: [PATCH 10/12] pep8, move to constant

---
 pytorch_lightning/trainer/training_loop.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 7b49ac20b9826..437b0a1d3ee33 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -144,9 +144,9 @@ def training_step(self, batch, batch_idx):
 import atexit
 import signal
 from abc import ABC, abstractmethod
+from functools import partial
 from typing import Callable
 from typing import Union, List
-from functools import partial
 
 import numpy as np
 import torch
@@ -183,7 +183,7 @@ def training_step(self, batch, batch_idx):
     HOROVOD_AVAILABLE = True
 
 # constant which signals should be caught for graceful trainer shutdown
-SIGNAL_TERMINATE = ('SIGTERM', 'SIGKILL', 'SIGSEGV', 'SIGINT')
+SIGNAL_TERMINATE = ('SIGTERM', 'SIGSEGV', 'SIGINT')
 
 
 class TrainerTrainLoopMixin(ABC):
@@ -306,11 +306,12 @@ def has_arg(self, *args):
 
     def train(self):
         # add signal handlers for process kills
+        def _signal_kill_handler(*args):
+            return TrainerTrainLoopMixin.run_training_teardown(self)
         orig_signal_handlers = {}
         for sig_name in SIGNAL_TERMINATE:
             orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
-                                                           partial(_signal_kill_handler,
-                                                                   self=self)
+                                                           _signal_kill_handler)
 
         # get model
         model = self.get_model()
@@ -829,7 +830,3 @@ def _with_is_last(iterable):
     # yield last, no longer has next
     yield last, True
 
-
-def _signal_kill_handler(signum, frame, *args, **kwargs):
-    return TrainerTrainLoopMixin.run_training_teardown(*args, **kwargs)
-

From 8ff10c8782619fe5132adb39eb1416df0858f66f Mon Sep 17 00:00:00 2001
From: Justus Schock
Date: Mon, 27 Apr 2020 17:46:17 +0200
Subject: [PATCH 11/12] pep8

---
 pytorch_lightning/trainer/training_loop.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 437b0a1d3ee33..bd09d8252a5cc 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -144,7 +144,6 @@ def training_step(self, batch, batch_idx):
 import atexit
 import signal
 from abc import ABC, abstractmethod
-from functools import partial
 from typing import Callable
 from typing import Union, List
 
@@ -308,6 +307,7 @@ def train(self):
         # add signal handlers for process kills
         def _signal_kill_handler(*args):
             return TrainerTrainLoopMixin.run_training_teardown(self)
+
         orig_signal_handlers = {}
         for sig_name in SIGNAL_TERMINATE:
             orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
                                                            _signal_kill_handler)
@@ -829,4 +829,3 @@ def _with_is_last(iterable):
         last = val
     # yield last, no longer has next
     yield last, True
-

From 27b027756817c3e3b8a06d9df1972df5c436d7b0 Mon Sep 17 00:00:00 2001
From: Jirka
Date: Tue, 12 May 2020 10:15:38 +0200
Subject: [PATCH 12/12] timeout

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 2237e39423bb0..1cd6ac7a4d27a 100755
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -22,7 +22,7 @@ references:
       command: |
         python --version ; pip --version ; pip list
         py.test pytorch_lightning tests -v --doctest-modules --junitxml=test-reports/pytest_junit.xml
-      no_output_timeout: 30m
+      no_output_timeout: 15m
 
   examples: &examples
     run:
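
Taken together, the series converges on one pattern: an idempotent teardown reachable from the normal end of `train()`, from `atexit`, and from a signal handler, with displaced handlers restored afterwards. The sketch below condenses that pattern outside Lightning; `Trainer`, `teardown()` and the `_teardown_already_run` flag are reduced stand-ins for `TrainerTrainLoopMixin.run_training_teardown`, not the real API:

    import atexit
    import signal

    # signals a process may intercept; SIGKILL is uncatchable, which is
    # why patch 10 removes it from SIGNAL_TERMINATE
    SIGNAL_TERMINATE = ('SIGTERM', 'SIGSEGV', 'SIGINT')

    class Trainer:
        def __init__(self):
            self._teardown_already_run = False

        def teardown(self):
            # the guard makes the routine idempotent: it may be reached via
            # the signal handler, via atexit, and via the end of train()
            if self._teardown_already_run:
                return
            print('on_train_end callbacks, profiler summary ...')
            self._teardown_already_run = True

        def train(self):
            def _signal_kill_handler(*args):
                self.teardown()

            # install handlers, keeping the originals for later restoration;
            # signal.signal returns the handler it displaces
            orig_handlers = {}
            for sig_name in SIGNAL_TERMINATE:
                orig_handlers[sig_name] = signal.signal(
                    getattr(signal, sig_name), _signal_kill_handler)

            # a runtime registration binds the instance, unlike the
            # decorator form used in patch 01
            atexit.register(self.teardown)

            try:
                pass  # the training loop itself would run here
            finally:
                self.teardown()
                for sig_name in SIGNAL_TERMINATE:
                    signal.signal(getattr(signal, sig_name),
                                  orig_handlers[sig_name])

    if __name__ == '__main__':
        Trainer().train()

Running the module and interrupting it with Ctrl-C or `kill -TERM <pid>` exercises the same teardown path as a normal exit, and the guard flag keeps the callbacks from firing twice; a production handler would typically also re-raise the signal or call `sys.exit()` once teardown is done.
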