From 20f32b450fe9d2d1ffc58bcd57e0eadb539fa5c4 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 18 Feb 2020 10:06:02 +0100 Subject: [PATCH 01/28] remove deprecated args to learning rate step function --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index d8bdb3f49c585..aa52b91f407e8 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -362,7 +362,7 @@ def train(self): # update LR schedulers if self.lr_schedulers is not None: for lr_scheduler in self.lr_schedulers: - lr_scheduler.step(epoch=self.current_epoch) + lr_scheduler.step() if self.reduce_lr_on_plateau_scheduler is not None: val_loss = self.callback_metrics.get('val_loss') if val_loss is None: @@ -370,7 +370,7 @@ def train(self): m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ f'which is not available. Available metrics are: {avail_metrics}' raise MisconfigurationException(m) - self.reduce_lr_on_plateau_scheduler.step(val_loss, epoch=self.current_epoch) + self.reduce_lr_on_plateau_scheduler.step(val_loss) # early stopping met_min_epochs = epoch >= self.min_epochs - 1 From ef237f1f4b03928137bde7eb34a457d4a10f411a Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 25 Feb 2020 10:50:23 +0100 Subject: [PATCH 02/28] step based scheduler --- pytorch_lightning/trainer/trainer.py | 35 ++++++++++++--- pytorch_lightning/trainer/training_loop.py | 50 +++++++++++++++------- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index d2395ee566b0c..74eb245e84247 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -876,25 +876,46 @@ def fit(self, model): return 1 def init_optimizers(self, optimizers): - # single optimizer + # single output, single optimizer if isinstance(optimizers, Optimizer): return [optimizers], [] - # two lists - if len(optimizers) == 2 and isinstance(optimizers[0], list): + # two lists, optimizer + lr schedulers + elif len(optimizers) == 2 and isinstance(optimizers[0], list): optimizers, lr_schedulers = optimizers lr_schedulers, self.reduce_lr_on_plateau_scheduler = self.configure_schedulers(lr_schedulers) return optimizers, lr_schedulers - - # single list or tuple - if isinstance(optimizers, (list, tuple)): + + # single list or tuple, multiple optimizer + elif isinstance(optimizers, (list, tuple)): return optimizers, [] - + + # unknown configuration + else: + raise ValueError('Unknown configuration for model optimizers. 
Output' + 'from model.configure_optimizers() should either be:' + '* single output, single torch.optim.Optimizer' + '* single output, list of torch.optim.Optimizer' + '* two outputs, first being a list of torch.optim.Optimizer', + 'second being a list of torch.optim.lr_scheduler') + def configure_schedulers(self, schedulers): + # Determine number of steps for each scheduler + self.lr_steps = [ ] + for i, scheduler in enumerate(schedulers): + if isinstance(scheduler, (list, tuple)): + self.lr_steps.append(scheduler[1]) + schedulers[i] = scheduler[0] + else: + self.lr_steps.append(None) + + # Special case if scheduler is ReduceLROnPlateau for i, scheduler in enumerate(schedulers): if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): reduce_lr_on_plateau_scheduler = schedulers.pop(i) + self.lr_step_reduce_on_plateau = self.lr_steps.pop(i) return schedulers, reduce_lr_on_plateau_scheduler + return schedulers, None def run_pretrain_routine(self, model): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index aa52b91f407e8..62808c1928180 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -358,20 +358,10 @@ def train(self): # RUN TNG EPOCH # ----------------- self.run_training_epoch() - - # update LR schedulers - if self.lr_schedulers is not None: - for lr_scheduler in self.lr_schedulers: - lr_scheduler.step() - if self.reduce_lr_on_plateau_scheduler is not None: - val_loss = self.callback_metrics.get('val_loss') - if val_loss is None: - avail_metrics = ','.join(list(self.callback_metrics.keys())) - m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ - f'which is not available. Available metrics are: {avail_metrics}' - raise MisconfigurationException(m) - self.reduce_lr_on_plateau_scheduler.step(val_loss) - + + # update lr + self.update_lr() + # early stopping met_min_epochs = epoch >= self.min_epochs - 1 if (self.enable_early_stop and not self.disable_validation and is_val_epoch and @@ -430,7 +420,10 @@ def run_training_epoch(self): # when returning -1 from train_step, we end epoch early early_stop_epoch = batch_result == -1 - + + # update lr + self.update_lr(current_step=self.total_batch_idx) + # --------------- # RUN VAL STEP # --------------- @@ -670,3 +663,30 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): output = self.process_output(output, train=True) return output + + def update_lr(self, current_step=None): + # update LR schedulers + if self.lr_schedulers is not None: + for lr_scheduler, lr_step in zip(self.lr_schedulers, self.lr_steps): + if current_step and lr_step: + if current_step % lr_step == 0: + lr_scheduler.step() + # if user did not supply lr_step, then this is invoked after after each epoch + elif current_step==None and lr_step==None: + lr_scheduler.step() + + if self.reduce_lr_on_plateau_scheduler is not None: + val_loss = self.callback_metrics.get('val_loss') + if val_loss is None: + avail_metrics = ','.join(list(self.callback_metrics.keys())) + m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ + f'which is not available. 
Available metrics are: {avail_metrics}' + raise MisconfigurationException(m) + + if current_step and self.lr_step_reduce_on_plateau: + if current_step % self.lr_step_reduce_on_plateau == 0: + self.reduce_lr_on_plateau_scheduler.step(val_loss) + print('LR UPDATE') + elif current_step==None and lr_step==None: + self.reduce_lr_on_plateau_scheduler.step(val_loss) + print('LR UPDATE') From 67ae533efe6dfec2c3f4bcc5601bdd743e37dc6b Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 25 Feb 2020 10:50:50 +0100 Subject: [PATCH 03/28] mixing models for testing --- tests/models/mixins.py | 48 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/tests/models/mixins.py b/tests/models/mixins.py index 03da85d59d096..1c2cad0e9ac1c 100644 --- a/tests/models/mixins.py +++ b/tests/models/mixins.py @@ -1,7 +1,7 @@ from collections import OrderedDict import torch - +from torch import optim from pytorch_lightning.core.decorators import data_loader @@ -379,3 +379,49 @@ def test_end(self, outputs): tqdm_dict = {'test_loss': test_loss_mean.item(), 'test_acc': test_acc_mean.item()} result = {'progress_bar': tqdm_dict} return result + +class LightningTestOptimizerWithSchedulingMixin: + def configure_optimizers(self): + if self.hparams.optimizer_name == 'lbfgs': + optimizer = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) + else: + optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + lr_scheduling = optim.lr_scheduling.CosineAnnealingLR(optimizer, 10) + return [optimizer], [lr_scheduling] + +class LightningTestMultipleOptimizersMixin: + def configure_optimizers(self): + if self.hparams.optimizer_name == 'lbfgs': + optimizer1 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) + optimizer2 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) + else: + optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + return [optimizer1, optimizer2] + +class LightningTestMultipleOptimizersWithSchedulingMixin: + def configure_optimizers(self): + if self.hparams.optimizer_name == 'lbfgs': + optimizer1 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) + optimizer2 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) + else: + optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + lr_scheduling1 = optim.lr_scheduling.CosineAnnealingLR(optimizer1, 10) + lr_scheduling2 = optim.lr_scheduling.CosineAnnealingLR(optimizer2, 10) + + return [optimizer1, optimizer2], [lr_scheduling1, lr_scheduling2] + +class LightningTestOptimizersWithSchedulingAndStepsMixin: + def configure_optimizers(self): + if self.hparams.optimizer_name == 'lbfgs': + optimizer1 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) + optimizer2 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) + else: + optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + lr_scheduling1 = optim.lr_scheduling.CosineAnnealingLR(optimizer1, 10) + lr_scheduling2 = optim.lr_scheduling.CosineAnnealingLR(optimizer2, 10) + + return [optimizer1, optimizer2], [lr_scheduling1, [lr_scheduling2, 10]] + \ No newline at end of file From e640403be10925679367f0a57109b7d02fe7ba62 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 25 Feb 2020 11:06:04 +0100 
Subject: [PATCH 04/28] fix styling --- pytorch_lightning/trainer/trainer.py | 10 +++++----- pytorch_lightning/trainer/training_loop.py | 17 ++++++++--------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7b87502bb5f87..71db9461bd519 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -994,11 +994,11 @@ def init_optimizers( optimizers, lr_schedulers = optimizers lr_schedulers, self.reduce_lr_on_plateau_scheduler = self.configure_schedulers(lr_schedulers) return optimizers, lr_schedulers - + # single list or tuple, multiple optimizer elif isinstance(optimizers, (list, tuple)): return optimizers, [] - + # unknown configuration else: raise ValueError('Unknown configuration for model optimizers. Output' @@ -1007,17 +1007,17 @@ def init_optimizers( '* single output, list of torch.optim.Optimizer' '* two outputs, first being a list of torch.optim.Optimizer', 'second being a list of torch.optim.lr_scheduler') - + def configure_schedulers(self, schedulers: list): # Determine number of steps for each scheduler - self.lr_steps = [ ] + self.lr_steps = [] for i, scheduler in enumerate(schedulers): if isinstance(scheduler, (list, tuple)): self.lr_steps.append(scheduler[1]) schedulers[i] = scheduler[0] else: self.lr_steps.append(None) - + # Special case if scheduler is ReduceLROnPlateau for i, scheduler in enumerate(schedulers): if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 8b345627cf421..2dd96c858695e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -429,10 +429,10 @@ def run_training_epoch(self): # when returning -1 from train_step, we end epoch early early_stop_epoch = batch_result == -1 - + # update lr self.update_lr(current_step=self.total_batch_idx) - + # --------------- # RUN VAL STEP # --------------- @@ -688,7 +688,7 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): output = self.process_output(output, train=True) return output - + def update_lr(self, current_step=None): # update LR schedulers if self.lr_schedulers is not None: @@ -697,9 +697,9 @@ def update_lr(self, current_step=None): if current_step % lr_step == 0: lr_scheduler.step() # if user did not supply lr_step, then this is invoked after after each epoch - elif current_step==None and lr_step==None: + elif current_step is None and lr_step is None: lr_scheduler.step() - + if self.reduce_lr_on_plateau_scheduler is not None: val_loss = self.callback_metrics.get('val_loss') if val_loss is None: @@ -707,11 +707,10 @@ def update_lr(self, current_step=None): m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ f'which is not available. 
Available metrics are: {avail_metrics}' raise MisconfigurationException(m) - + if current_step and self.lr_step_reduce_on_plateau: if current_step % self.lr_step_reduce_on_plateau == 0: self.reduce_lr_on_plateau_scheduler.step(val_loss) - print('LR UPDATE') - elif current_step==None and lr_step==None: + + elif current_step is None and lr_step is None: self.reduce_lr_on_plateau_scheduler.step(val_loss) - print('LR UPDATE') From 2e674e82b432f9fe930aaf9c1c9fd75ccd3b2995 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 25 Feb 2020 16:03:21 +0100 Subject: [PATCH 05/28] tests --- tests/models/__init__.py | 5 +- tests/models/base.py | 2 +- tests/models/mixins.py | 36 +++++------ tests/test_trainer.py | 131 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 149 insertions(+), 25 deletions(-) diff --git a/tests/models/__init__.py b/tests/models/__init__.py index df16bffd668b8..3b2a38dd7c501 100644 --- a/tests/models/__init__.py +++ b/tests/models/__init__.py @@ -15,7 +15,10 @@ LightningTestFitSingleTestDataloadersMixin, LightningTestFitMultipleTestDataloadersMixin, LightningValStepFitSingleDataloaderMixin, - LightningValStepFitMultipleDataloadersMixin + LightningValStepFitMultipleDataloadersMixin, + LightningTestOptimizerWithSchedulingMixin, + LightningTestMultipleOptimizersWithSchedulingMixin, + LightningTestOptimizersWithSchedulingAndStepsMixin ) diff --git a/tests/models/base.py b/tests/models/base.py index 2c17402624d63..053c079e8c460 100644 --- a/tests/models/base.py +++ b/tests/models/base.py @@ -102,7 +102,7 @@ def loss(self, labels, logits): nll = F.nll_loss(logits, labels) return nll - def training_step(self, batch, batch_idx): + def training_step(self, batch, batch_idx, optimizer_idx=None): """ Lightning calls this inside the training loop :param batch: diff --git a/tests/models/mixins.py b/tests/models/mixins.py index ff107d5827f08..313d78a3fd17a 100644 --- a/tests/models/mixins.py +++ b/tests/models/mixins.py @@ -568,25 +568,17 @@ def test_end(self, outputs): result = {'progress_bar': tqdm_dict} return result + class LightningTestOptimizerWithSchedulingMixin: def configure_optimizers(self): if self.hparams.optimizer_name == 'lbfgs': optimizer = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) else: optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - lr_scheduling = optim.lr_scheduling.CosineAnnealingLR(optimizer, 10) - return [optimizer], [lr_scheduling] + lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1) + return [optimizer], [lr_scheduler] + -class LightningTestMultipleOptimizersMixin: - def configure_optimizers(self): - if self.hparams.optimizer_name == 'lbfgs': - optimizer1 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) - optimizer2 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) - else: - optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - return [optimizer1, optimizer2] - class LightningTestMultipleOptimizersWithSchedulingMixin: def configure_optimizers(self): if self.hparams.optimizer_name == 'lbfgs': @@ -595,11 +587,12 @@ def configure_optimizers(self): else: optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - lr_scheduling1 = optim.lr_scheduling.CosineAnnealingLR(optimizer1, 10) - lr_scheduling2 = optim.lr_scheduling.CosineAnnealingLR(optimizer2, 10) - - return [optimizer1, optimizer2], 
[lr_scheduling1, lr_scheduling2] - + lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) + lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) + + return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] + + class LightningTestOptimizersWithSchedulingAndStepsMixin: def configure_optimizers(self): if self.hparams.optimizer_name == 'lbfgs': @@ -608,8 +601,7 @@ def configure_optimizers(self): else: optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - lr_scheduling1 = optim.lr_scheduling.CosineAnnealingLR(optimizer1, 10) - lr_scheduling2 = optim.lr_scheduling.CosineAnnealingLR(optimizer2, 10) - - return [optimizer1, optimizer2], [lr_scheduling1, [lr_scheduling2, 10]] - \ No newline at end of file + lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) + lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) + + return [optimizer1, optimizer2], [lr_scheduler1, [lr_scheduler2, 3]] diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 595dc4255392e..8e4a143c2a584 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -20,7 +20,10 @@ LightningTestFitSingleTestDataloadersMixin, LightningTestFitMultipleTestDataloadersMixin, LightningValStepFitMultipleDataloadersMixin, - LightningValStepFitSingleDataloaderMixin + LightningValStepFitSingleDataloaderMixin, + LightningTestOptimizerWithSchedulingMixin, + LightningTestMultipleOptimizersWithSchedulingMixin, + LightningTestOptimizersWithSchedulingAndStepsMixin ) from pytorch_lightning.core.lightning import load_hparams_from_tags_csv from pytorch_lightning.trainer.logging import TrainerLoggingMixin @@ -816,5 +819,131 @@ def test_dataloader(self): Trainer().test(model) +def test_optimizer_with_scheduling(tmpdir): + """ Verify that learning rate scheduling is working """ + tutils.reset_seed() + + class CurrentTestModel( + LightningTestOptimizerWithSchedulingMixin, + LightningTestModelBase): + pass + + hparams = tutils.get_hparams() + model = CurrentTestModel(hparams) + + # logger file to get meta + trainer_options = dict( + default_save_path=tmpdir, + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.2 + ) + + # fit model + trainer = Trainer(**trainer_options) + results = trainer.fit(model) + + init_lr = hparams.learning_rate + adjusted_lr = [pg['lr'] for pg in trainer.optimizers[0].param_groups] + + assert len(trainer.lr_schedulers) == 1, \ + 'lr scheduler not initialized properly' + + assert all(a == adjusted_lr[0] for a in adjusted_lr), \ + 'lr not equally adjusted for all param groups' + adjusted_lr = adjusted_lr[0] + + assert init_lr * 0.1 == adjusted_lr, \ + 'lr not adjusted correctly' + + +def test_multi_optimizer_with_scheduling(tmpdir): + """ Verify that learning rate scheduling is working """ + tutils.reset_seed() + + class CurrentTestModel( + LightningTestMultipleOptimizersWithSchedulingMixin, + LightningTestModelBase): + pass + + hparams = tutils.get_hparams() + model = CurrentTestModel(hparams) + + # logger file to get meta + trainer_options = dict( + default_save_path=tmpdir, + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.2 + ) + + # fit model + trainer = Trainer(**trainer_options) + results = trainer.fit(model) + + init_lr = hparams.learning_rate + adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups] + adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups] + + assert len(trainer.lr_schedulers) == 2, 
\ + 'all lr scheduler not initialized properly' + + assert all(a == adjusted_lr1[0] for a in adjusted_lr1), \ + 'lr not equally adjusted for all param groups for optimizer 1' + adjusted_lr1 = adjusted_lr1[0] + + assert all(a == adjusted_lr2[0] for a in adjusted_lr2), \ + 'lr not equally adjusted for all param groups for optimizer 2' + adjusted_lr2 = adjusted_lr2[0] + + assert init_lr * 0.1 == adjusted_lr1 and init_lr * 0.1 == adjusted_lr2, \ + 'lr not adjusted correctly' + + +def test_multi_optimizer_with_scheduling_stepping(tmpdir): + tutils.reset_seed() + + class CurrentTestModel( + LightningTestOptimizersWithSchedulingAndStepsMixin, + LightningTestModelBase): + pass + + hparams = tutils.get_hparams() + model = CurrentTestModel(hparams) + + # logger file to get meta + trainer_options = dict( + default_save_path=tmpdir, + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.2 + ) + + # fit model + trainer = Trainer(**trainer_options) + results = trainer.fit(model) + + init_lr = hparams.learning_rate + adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups] + adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups] + + assert len(trainer.lr_schedulers) == 2, \ + 'all lr scheduler not initialized properly' + + assert all(a == adjusted_lr1[0] for a in adjusted_lr1), \ + 'lr not equally adjusted for all param groups for optimizer 1' + adjusted_lr1 = adjusted_lr1[0] + + assert all(a == adjusted_lr2[0] for a in adjusted_lr2), \ + 'lr not equally adjusted for all param groups for optimizer 2' + adjusted_lr2 = adjusted_lr2[0] + + # Called ones after end of epoch + assert init_lr * 0.1 == adjusted_lr1, \ + 'lr for optimizer 1 not adjusted correctly' + # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times + assert init_lr * (0.1)**3 == adjusted_lr2, \ + 'lr for optimizer 2 not adjusted correctly' + # if __name__ == '__main__': # pytest.main([__file__]) From 4b966347134905f3f768a3bee165a80ced79fed2 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 25 Feb 2020 16:18:14 +0100 Subject: [PATCH 06/28] update documentation --- pytorch_lightning/core/lightning.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index fab1d3a878c3c..93779b468ed86 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -750,6 +750,14 @@ def configure_optimizers(self): discriminator_sched = CosineAnnealing(discriminator_opt, T_max=10) return [generator_opt, disriminator_opt], [discriminator_sched] + # example with learning_rate schedulers with step-frequency + def configure_optimizers(self): + gen_opt = Adam(self.model_gen.parameters(), lr=0.01) + dis_opt = Adam(self.model_disc.parameters(), lr=0.02) + gen_sched = [ExponentialLR(gen_opt, 0.99), 100] # called every 100 training step + dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called after each epoch + return [gen_opt, dis_opt], [gen_sched, dis_sched] + .. note:: Lightning calls .backward() and .step() on each optimizer and learning rate scheduler as needed. .. note:: If you use 16-bit precision (use_amp=True), Lightning will automatically @@ -765,6 +773,9 @@ def configure_optimizers(self): .. note:: If you need to control how often those optimizers step or override the default .step() schedule, override the `optimizer_step` hook. + .. 
note:: If you need to control how often each learning rate scheduler is called, you can return + them as [[scheduler1, freq1], [scheduler2, freq2], ...], where freq is then number of training + steps between each update """ From b3a8d09c5ccf6fc171a4296398bda768e4141f8e Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 25 Feb 2020 21:24:34 +0100 Subject: [PATCH 07/28] smaller fix --- pytorch_lightning/trainer/trainer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 71db9461bd519..b580ffe081ff0 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -705,7 +705,10 @@ def __init__( # creates a default one if none passed in self.configure_early_stopping(early_stop_callback) + # configure variables for learning rate scheduling + self.lr_steps = [] self.reduce_lr_on_plateau_scheduler = None + self.lr_step_reduce_on_plateau = None # configure checkpoint callback self.checkpoint_callback = checkpoint_callback @@ -987,7 +990,7 @@ def init_optimizers( # single output, single optimizer if isinstance(optimizers, Optimizer): - return [optimizers], [] + return [optimizers], None # two lists, optimizer + lr schedulers elif len(optimizers) == 2 and isinstance(optimizers[0], list): @@ -997,7 +1000,7 @@ def init_optimizers( # single list or tuple, multiple optimizer elif isinstance(optimizers, (list, tuple)): - return optimizers, [] + return optimizers, None # unknown configuration else: @@ -1010,7 +1013,6 @@ def init_optimizers( def configure_schedulers(self, schedulers: list): # Determine number of steps for each scheduler - self.lr_steps = [] for i, scheduler in enumerate(schedulers): if isinstance(scheduler, (list, tuple)): self.lr_steps.append(scheduler[1]) From 8fa7a03cd31f9330ae125016dcda8ad28a468b6b Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 26 Feb 2020 15:55:48 +0100 Subject: [PATCH 08/28] update to dict structure --- pytorch_lightning/trainer/trainer.py | 48 ++++++++++++++-------- pytorch_lightning/trainer/training_loop.py | 46 +++++++++------------ 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 87428209d3551..4f05821767442 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -5,6 +5,7 @@ from typing import Union, Optional, List, Dict, Tuple, Iterable import torch +from torch import optim import torch.distributed as dist import torch.multiprocessing as mp from torch.utils.data import DataLoader @@ -1038,7 +1039,7 @@ def init_optimizers( # two lists, optimizer + lr schedulers elif len(optimizers) == 2 and isinstance(optimizers[0], list): optimizers, lr_schedulers = optimizers - lr_schedulers, self.reduce_lr_on_plateau_scheduler = self.configure_schedulers(lr_schedulers) + lr_schedulers = self.configure_schedulers(lr_schedulers) return optimizers, lr_schedulers # single list or tuple, multiple optimizer @@ -1055,22 +1056,37 @@ def init_optimizers( 'second being a list of torch.optim.lr_scheduler') def configure_schedulers(self, schedulers: list): - # Determine number of steps for each scheduler - for i, scheduler in enumerate(schedulers): - if isinstance(scheduler, (list, tuple)): - self.lr_steps.append(scheduler[1]) - schedulers[i] = scheduler[0] + # Convert each scheduler into dict sturcture with relevant information + lr_schedulers = [] + for scheduler in schedulers: + if isinstance(scheduler, 
dict): + if not 'scheduler' in scheduler: + raise ValueError(f'Lr scheduler should have key `scheduler`', + 'with item being a lr scheduler') + if not 'interval' in scheduler: + scheduler['interval'] = 'epoch' # default every epoch + if not 'frequency' in scheduler: + scheduler['frequency'] = 1 # default every step + scheduler['reduce_on_plateau'] = \ + isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau) + + lr_schedulers.append(scheduler) + + elif isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau): + lr_schedulers.append({'scheduler': scheduler, + 'interval': 'epoch', + 'frequency': 1, + 'reduce_on_plateau': True}) + + elif isinstance(scheduler, optim.lr_scheduler._LRScheduler): + lr_schedulers.append({'scheduler': scheduler, + 'interval': 'epoch', + 'frequency': 1, + 'reduce_on_plateau': False}) else: - self.lr_steps.append(None) - - # Special case if scheduler is ReduceLROnPlateau - for i, scheduler in enumerate(schedulers): - if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): - reduce_lr_on_plateau_scheduler = schedulers.pop(i) - self.lr_step_reduce_on_plateau = self.lr_steps.pop(i) - return schedulers, reduce_lr_on_plateau_scheduler - - return schedulers, None + raise ValueError(f'Input {scheduler} to lr schedulers ' + 'is a invalid input.') + return lr_schedulers def run_pretrain_routine(self, model: LightningModule): """Sanity check a few things before starting actual training. diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 6f8f3cc763f73..1b0f6222b3906 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -393,7 +393,7 @@ def train(self): self.run_training_epoch() # update LR schedulers - self.update_lr() + self.update_lr(interval='epoch') if self.max_steps and self.max_steps == self.global_step: self.main_progress_bar.close() @@ -468,7 +468,7 @@ def run_training_epoch(self): early_stop_epoch = batch_result == -1 # update lr - self.update_lr(current_step=self.total_batch_idx) + self.update_lr(interval='batch') # --------------- # RUN VAL STEP @@ -735,28 +735,22 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): return output - def update_lr(self, current_step=None): - # update LR schedulers + def update_lr(self, interval): + ''' Update lr ''' if self.lr_schedulers is not None: - for lr_scheduler, lr_step in zip(self.lr_schedulers, self.lr_steps): - if current_step and lr_step: - if current_step % lr_step == 0: - lr_scheduler.step() - # if user did not supply lr_step, then this is invoked after after each epoch - elif current_step is None and lr_step is None: - lr_scheduler.step() - - if self.reduce_lr_on_plateau_scheduler is not None: - val_loss = self.callback_metrics.get('val_loss') - if val_loss is None: - avail_metrics = ','.join(list(self.callback_metrics.keys())) - m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ - f'which is not available. 
Available metrics are: {avail_metrics}' - raise MisconfigurationException(m) - - if current_step and self.lr_step_reduce_on_plateau: - if current_step % self.lr_step_reduce_on_plateau == 0: - self.reduce_lr_on_plateau_scheduler.step(val_loss) - - elif current_step is None and lr_step is None: - self.reduce_lr_on_plateau_scheduler.step(val_loss) + for lr_scheduler in self.lr_schedulers: + current_step_idx = self.batch_idx if interval=='batch' else self.current_epoch + # Take step if call to update_lr matches the interval key and + # the current step modulo the schedulers frequency is zero + if lr_scheduler['interval'] == interval and current_step_idx % lr_scheduler['frequency'] == 0: + # If instance of ReduceLROnPlateau, we need to pass validation loss + if lr_scheduler['reduce_on_plateau']: + val_loss = self.callback_metrics.get('val_loss') + if val_loss is None: + avail_metrics = ','.join(list(self.callback_metrics.keys())) + m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ + f'which is not available. Available metrics are: {avail_metrics}' + raise MisconfigurationException(m) + lr_scheduler['scheduler'].step(val_loss) + else: + lr_scheduler['scheduler'].step() From 12a526cbcbf9bb7356d5f8a6d1b072ff99fdc35f Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 26 Feb 2020 16:43:28 +0100 Subject: [PATCH 09/28] updated test --- tests/models/__init__.py | 2 +- tests/models/mixins.py | 7 ++++--- tests/test_trainer.py | 8 ++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/models/__init__.py b/tests/models/__init__.py index 24ba90e8ea6e6..e5a162e483134 100644 --- a/tests/models/__init__.py +++ b/tests/models/__init__.py @@ -21,7 +21,7 @@ LightTestDataloader, LightTestOptimizerWithSchedulingMixin, LightTestMultipleOptimizersWithSchedulingMixin, - LightTestOptimizersWithSchedulingAndStepsMixin + LightTestOptimizersWithMixedSchedulingMixin ) diff --git a/tests/models/mixins.py b/tests/models/mixins.py index 260b23fcf63a4..e0a12ea947a3f 100644 --- a/tests/models/mixins.py +++ b/tests/models/mixins.py @@ -622,7 +622,7 @@ def configure_optimizers(self): return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] -class LightTestOptimizersWithSchedulingAndStepsMixin: +class LightTestOptimizersWithMixedSchedulingMixin: def configure_optimizers(self): if self.hparams.optimizer_name == 'lbfgs': optimizer1 = optim.LBFGS(self.parameters(), lr=self.hparams.learning_rate) @@ -630,10 +630,11 @@ def configure_optimizers(self): else: optimizer1 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.hparams.learning_rate) - lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) + lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 4, gamma=0.1) lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) - return [optimizer1, optimizer2], [lr_scheduler1, [lr_scheduler2, 3]] + return [optimizer1, optimizer2], \ + [{'scheduler': lr_scheduler1, 'interval': 'batch'}, lr_scheduler2] def _get_output_metric(output, name): diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 357acaf5ecb2e..e19d71c62ae44 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -27,7 +27,7 @@ LightTestMixin, LightTestOptimizerWithSchedulingMixin, LightTestMultipleOptimizersWithSchedulingMixin, - LightTestOptimizersWithSchedulingAndStepsMixin + LightTestOptimizersWithMixedSchedulingMixin ) from pytorch_lightning.core.lightning import load_hparams_from_tags_csv from pytorch_lightning.trainer.logging 
import TrainerLoggingMixin @@ -1061,7 +1061,7 @@ def test_multi_optimizer_with_scheduling_stepping(tmpdir): tutils.reset_seed() class CurrentTestModel( - LightTestOptimizersWithSchedulingAndStepsMixin, + LightTestOptimizersWithMixedSchedulingMixin, LightTrainDataloader, TestModelBase): pass @@ -1097,10 +1097,10 @@ class CurrentTestModel( adjusted_lr2 = adjusted_lr2[0] # Called ones after end of epoch - assert init_lr * 0.1 == adjusted_lr1, \ + assert init_lr * (0.1)**3 == adjusted_lr1, \ 'lr for optimizer 1 not adjusted correctly' # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times - assert init_lr * (0.1)**3 == adjusted_lr2, \ + assert init_lr * 0.1 == adjusted_lr2, \ 'lr for optimizer 2 not adjusted correctly' From a3ac63f08944f9f7966e9cd056a9a4731ffcc51b Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 26 Feb 2020 16:53:04 +0100 Subject: [PATCH 10/28] update documentation --- pytorch_lightning/core/lightning.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index a47c3d70469cc..3ea0b1074aca0 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -751,11 +751,12 @@ def configure_optimizers(self): discriminator_sched = CosineAnnealing(discriminator_opt, T_max=10) return [generator_opt, disriminator_opt], [discriminator_sched] - # example with learning_rate schedulers with step-frequency + # example with step-based learning_rate schedulers def configure_optimizers(self): gen_opt = Adam(self.model_gen.parameters(), lr=0.01) dis_opt = Adam(self.model_disc.parameters(), lr=0.02) - gen_sched = [ExponentialLR(gen_opt, 0.99), 100] # called every 100 training step + gen_sched = {'scheduler': ExponentialLR(gen_opt, 0.99), + 'interval': 'batch'] # called after each step dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called after each epoch return [gen_opt, dis_opt], [gen_sched, dis_sched] @@ -774,9 +775,8 @@ def configure_optimizers(self): .. note:: If you need to control how often those optimizers step or override the default .step() schedule, override the `optimizer_step` hook. - .. note:: If you need to control how often each learning rate scheduler is called, you can return - them as [[scheduler1, freq1], [scheduler2, freq2], ...], where freq is then number of training - steps between each update + .. note:: If you only want to call a learning rate schduler every `x` batch or epoch, + you can input this as 'frequency' key: dict(scheduler=lr_schudler, interval='batch' or 'epoch', frequency=x) """ From fc4847b963f659487b3118330bc5214b93cc223a Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 26 Feb 2020 16:57:14 +0100 Subject: [PATCH 11/28] update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee1349806af6c..752339b42dd6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added semantic segmentation example ([#751](https://github.com/PyTorchLightning/pytorch-lightning/pull/751),[#876](https://github.com/PyTorchLightning/pytorch-lightning/pull/876)) - Split callbacks in multiple files ([#849](https://github.com/PyTorchLightning/pytorch-lightning/pull/849)) - Added support for multiple loggers to be passed to `Trainer` as an iterable (e.g. list, tuple, etc.) 
([#903](https://github.com/PyTorchLightning/pytorch-lightning/pull/903)) +- Added support for step-based learning rate scheduling ([#941](https://github.com/PyTorchLightning/pytorch-lightning/pull/941)) ### Changed From 4167975ed4e455a237b3b16cbee8c6a7bd12657f Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 26 Feb 2020 19:14:35 +0100 Subject: [PATCH 12/28] fix styling --- pytorch_lightning/core/lightning.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 14 +++++++------- pytorch_lightning/trainer/training_loop.py | 2 +- tests/test_trainer.py | 3 ++- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 3ea0b1074aca0..1317e00b4cf4f 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -751,11 +751,11 @@ def configure_optimizers(self): discriminator_sched = CosineAnnealing(discriminator_opt, T_max=10) return [generator_opt, disriminator_opt], [discriminator_sched] - # example with step-based learning_rate schedulers + # example with step-based learning_rate schedulers def configure_optimizers(self): gen_opt = Adam(self.model_gen.parameters(), lr=0.01) dis_opt = Adam(self.model_disc.parameters(), lr=0.02) - gen_sched = {'scheduler': ExponentialLR(gen_opt, 0.99), + gen_sched = {'scheduler': ExponentialLR(gen_opt, 0.99), 'interval': 'batch'] # called after each step dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called after each epoch return [gen_opt, dis_opt], [gen_sched, dis_sched] diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4f05821767442..5e2df2120b72b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1060,24 +1060,24 @@ def configure_schedulers(self, schedulers: list): lr_schedulers = [] for scheduler in schedulers: if isinstance(scheduler, dict): - if not 'scheduler' in scheduler: + if 'scheduler' not in scheduler: raise ValueError(f'Lr scheduler should have key `scheduler`', 'with item being a lr scheduler') - if not 'interval' in scheduler: - scheduler['interval'] = 'epoch' # default every epoch - if not 'frequency' in scheduler: - scheduler['frequency'] = 1 # default every step + if 'interval' not in scheduler: + scheduler['interval'] = 'epoch' # default every epoch + if 'frequency' not in scheduler: + scheduler['frequency'] = 1 # default every step scheduler['reduce_on_plateau'] = \ isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau) lr_schedulers.append(scheduler) - + elif isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau): lr_schedulers.append({'scheduler': scheduler, 'interval': 'epoch', 'frequency': 1, 'reduce_on_plateau': True}) - + elif isinstance(scheduler, optim.lr_scheduler._LRScheduler): lr_schedulers.append({'scheduler': scheduler, 'interval': 'epoch', diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 1b0f6222b3906..a1f38b5180727 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -739,7 +739,7 @@ def update_lr(self, interval): ''' Update lr ''' if self.lr_schedulers is not None: for lr_scheduler in self.lr_schedulers: - current_step_idx = self.batch_idx if interval=='batch' else self.current_epoch + current_step_idx = self.batch_idx if interval == 'batch' else self.current_epoch # Take step if call to update_lr matches the interval key and # the current step modulo the schedulers frequency is zero if 
lr_scheduler['interval'] == interval and current_step_idx % lr_scheduler['frequency'] == 0: diff --git a/tests/test_trainer.py b/tests/test_trainer.py index e19d71c62ae44..862888113652d 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -974,6 +974,7 @@ def on_test_end(self, trainer, pl_module): assert test_callback.on_test_start_called assert test_callback.on_test_end_called + def test_optimizer_with_scheduling(tmpdir): """ Verify that learning rate scheduling is working """ tutils.reset_seed() @@ -983,7 +984,7 @@ class CurrentTestModel( LightTrainDataloader, TestModelBase): pass - + hparams = tutils.get_hparams() model = CurrentTestModel(hparams) From 6e2d712315f7e630e0624663e30fdd15043bbaf7 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 26 Feb 2020 19:31:04 +0100 Subject: [PATCH 13/28] fix problems with trainer io --- pytorch_lightning/trainer/trainer.py | 4 ++-- pytorch_lightning/trainer/training_io.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5e2df2120b72b..51af2e466d3ac 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1034,7 +1034,7 @@ def init_optimizers( # single output, single optimizer if isinstance(optimizers, Optimizer): - return [optimizers], None + return [optimizers], [] # two lists, optimizer + lr schedulers elif len(optimizers) == 2 and isinstance(optimizers[0], list): @@ -1044,7 +1044,7 @@ def init_optimizers( # single list or tuple, multiple optimizer elif isinstance(optimizers, (list, tuple)): - return optimizers, None + return optimizers, [] # unknown configuration else: diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 569c571838aa6..6404c075e392a 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -82,7 +82,7 @@ # restore the lr schedulers lr_schedulers = checkpoint['lr_schedulers'] for scheduler, lrs_state in zip(self.lr_schedulers, lr_schedulers): - scheduler.load_state_dict(lrs_state) + scheduler['scheduler'].load_state_dict(lrs_state) # uses the model you passed into trainer model.load_state_dict(checkpoint['state_dict']) @@ -344,7 +344,7 @@ def dump_checkpoint(self): # save lr schedulers lr_schedulers = [] for i, scheduler in enumerate(self.lr_schedulers): - lr_schedulers.append(scheduler.state_dict()) + lr_schedulers.append(scheduler['scheduler'].state_dict()) checkpoint['lr_schedulers'] = lr_schedulers @@ -431,7 +431,7 @@ def restore_training_state(self, checkpoint): # restore the lr schedulers lr_schedulers = checkpoint['lr_schedulers'] for scheduler, lrs_state in zip(self.lr_schedulers, lr_schedulers): - scheduler.load_state_dict(lrs_state) + scheduler['scheduler'].load_state_dict(lrs_state) # ---------------------------------- # PRIVATE OPS From 7d18fab44ff16b4933ede6173e16219522da2d01 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 26 Feb 2020 19:51:03 +0100 Subject: [PATCH 14/28] fix tests --- tests/test_gpu_models.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_gpu_models.py b/tests/test_gpu_models.py index a47b97f95ed91..381124c1b1703 100644 --- a/tests/test_gpu_models.py +++ b/tests/test_gpu_models.py @@ -91,10 +91,13 @@ def test_optimizer_return_options(): assert len(lr_sched) == 0 # opt tuple of lists - opts = ([opt_a], ['lr_scheduler']) + scheduler = torch.optim.lr_scheduler.StepLR(opt_a, 10) + opts = ([opt_a], 
[scheduler]) optim, lr_sched = trainer.init_optimizers(opts) assert len(optim) == 1 and len(lr_sched) == 1 - assert optim[0] == opts[0][0] and lr_sched[0] == 'lr_scheduler' + assert optim[0] == opts[0][0] and \ + lr_sched[0] == dict(scheduler=scheduler, interval='epoch', + frequency=1, reduce_on_plateau=False) def test_cpu_slurm_save_load(tmpdir): From f01597d19a811dc9514a078ba381e79e202b1253 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 27 Feb 2020 11:43:34 +0100 Subject: [PATCH 15/28] simplification of code --- pytorch_lightning/trainer/trainer.py | 25 ++++++---------------- pytorch_lightning/trainer/training_io.py | 2 +- pytorch_lightning/trainer/training_loop.py | 8 +++---- 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 429962a968c0d..8cfc2516eaff8 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -740,11 +740,6 @@ def on_train_end(self): # creates a default one if none passed in self.configure_early_stopping(early_stop_callback) - # configure variables for learning rate scheduling - self.lr_steps = [] - self.reduce_lr_on_plateau_scheduler = None - self.lr_step_reduce_on_plateau = None - # configure checkpoint callback self.checkpoint_callback = checkpoint_callback self.weights_save_path = weights_save_path @@ -1058,31 +1053,25 @@ def init_optimizers( def configure_schedulers(self, schedulers: list): # Convert each scheduler into dict sturcture with relevant information lr_schedulers = [] + default_config = {'interval': 'epoch', # default every epoch + 'frequency': 1, # default every epoch/batch + 'reduce_on_plateau': False} # most often not ReduceLROnPlateau scheduler} for scheduler in schedulers: if isinstance(scheduler, dict): if 'scheduler' not in scheduler: raise ValueError(f'Lr scheduler should have key `scheduler`', - 'with item being a lr scheduler') - if 'interval' not in scheduler: - scheduler['interval'] = 'epoch' # default every epoch - if 'frequency' not in scheduler: - scheduler['frequency'] = 1 # default every step + ' with item being a lr scheduler') scheduler['reduce_on_plateau'] = \ isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau) - lr_schedulers.append(scheduler) + lr_schedulers.append({**default_config, **scheduler}) elif isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau): - lr_schedulers.append({'scheduler': scheduler, - 'interval': 'epoch', - 'frequency': 1, + lr_schedulers.append({**default_config, 'scheduler': scheduler, 'reduce_on_plateau': True}) elif isinstance(scheduler, optim.lr_scheduler._LRScheduler): - lr_schedulers.append({'scheduler': scheduler, - 'interval': 'epoch', - 'frequency': 1, - 'reduce_on_plateau': False}) + lr_schedulers.append({**default_config, 'scheduler': scheduler}) else: raise ValueError(f'Input {scheduler} to lr schedulers ' 'is a invalid input.') diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 6404c075e392a..f6f242961716e 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -343,7 +343,7 @@ def dump_checkpoint(self): # save lr schedulers lr_schedulers = [] - for i, scheduler in enumerate(self.lr_schedulers): + for scheduler in self.lr_schedulers: lr_schedulers.append(scheduler['scheduler'].state_dict()) checkpoint['lr_schedulers'] = lr_schedulers diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 
d807720577082..c2da4adcde0e9 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -400,7 +400,7 @@ def train(self): self.run_training_epoch() # update LR schedulers - self.update_lr(interval='epoch') + self.update_learning_rates(interval='epoch') if self.max_steps and self.max_steps == self.global_step: self.main_progress_bar.close() @@ -478,7 +478,7 @@ def run_training_epoch(self): early_stop_epoch = batch_result == -1 # update lr - self.update_lr(interval='batch') + self.update_learning_rates(interval='batch') # --------------- # RUN VAL STEP @@ -745,12 +745,12 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): return output - def update_lr(self, interval): + def update_learning_rates(self, interval): ''' Update lr ''' if self.lr_schedulers is not None: for lr_scheduler in self.lr_schedulers: current_step_idx = self.batch_idx if interval == 'batch' else self.current_epoch - # Take step if call to update_lr matches the interval key and + # Take step if call to update_learning_rates matches the interval key and # the current step modulo the schedulers frequency is zero if lr_scheduler['interval'] == interval and current_step_idx % lr_scheduler['frequency'] == 0: # If instance of ReduceLROnPlateau, we need to pass validation loss From 55d966100f4c4907ebbe83f0a72bf751a06fbf77 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 27 Feb 2020 12:18:45 +0100 Subject: [PATCH 16/28] fix styling --- pytorch_lightning/trainer/trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8cfc2516eaff8..0389a881371da 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1053,9 +1053,9 @@ def init_optimizers( def configure_schedulers(self, schedulers: list): # Convert each scheduler into dict sturcture with relevant information lr_schedulers = [] - default_config = {'interval': 'epoch', # default every epoch - 'frequency': 1, # default every epoch/batch - 'reduce_on_plateau': False} # most often not ReduceLROnPlateau scheduler} + default_config = {'interval': 'epoch', # default every epoch + 'frequency': 1, # default every epoch/batch + 'reduce_on_plateau': False} # most often not ReduceLROnPlateau scheduler for scheduler in schedulers: if isinstance(scheduler, dict): if 'scheduler' not in scheduler: From 27669101d08056d97d18cb7f38e3000a0ad65448 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Fri, 28 Feb 2020 10:29:54 +0100 Subject: [PATCH 17/28] change from batch to step --- pytorch_lightning/core/lightning.py | 6 +-- pytorch_lightning/trainer/training_loop.py | 43 ++++++++++++---------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 1317e00b4cf4f..1ee64175e948c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -756,7 +756,7 @@ def configure_optimizers(self): gen_opt = Adam(self.model_gen.parameters(), lr=0.01) dis_opt = Adam(self.model_disc.parameters(), lr=0.02) gen_sched = {'scheduler': ExponentialLR(gen_opt, 0.99), - 'interval': 'batch'] # called after each step + 'interval': 'step'] # called after each training step dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called after each epoch return [gen_opt, dis_opt], [gen_sched, dis_sched] @@ -775,8 +775,8 @@ def configure_optimizers(self): .. 
note:: If you need to control how often those optimizers step or override the default .step() schedule, override the `optimizer_step` hook. - .. note:: If you only want to call a learning rate schduler every `x` batch or epoch, - you can input this as 'frequency' key: dict(scheduler=lr_schudler, interval='batch' or 'epoch', frequency=x) + .. note:: If you only want to call a learning rate schduler every `x` step or epoch, + you can input this as 'frequency' key: dict(scheduler=lr_schudler, interval='step' or 'epoch', frequency=x) """ diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index c2da4adcde0e9..a5bda198904aa 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -478,7 +478,7 @@ def run_training_epoch(self): early_stop_epoch = batch_result == -1 # update lr - self.update_learning_rates(interval='batch') + self.update_learning_rates(interval='step') # --------------- # RUN VAL STEP @@ -746,21 +746,26 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): return output def update_learning_rates(self, interval): - ''' Update lr ''' - if self.lr_schedulers is not None: - for lr_scheduler in self.lr_schedulers: - current_step_idx = self.batch_idx if interval == 'batch' else self.current_epoch - # Take step if call to update_learning_rates matches the interval key and - # the current step modulo the schedulers frequency is zero - if lr_scheduler['interval'] == interval and current_step_idx % lr_scheduler['frequency'] == 0: - # If instance of ReduceLROnPlateau, we need to pass validation loss - if lr_scheduler['reduce_on_plateau']: - val_loss = self.callback_metrics.get('val_loss') - if val_loss is None: - avail_metrics = ','.join(list(self.callback_metrics.keys())) - m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ - f'which is not available. Available metrics are: {avail_metrics}' - raise MisconfigurationException(m) - lr_scheduler['scheduler'].step(val_loss) - else: - lr_scheduler['scheduler'].step() + ''' Update learning rates + Args: + interval (str): either 'epoch' or 'step'. + ''' + if not self.lr_schedulers: + return + + for lr_scheduler in self.lr_schedulers: + current_idx = self.batch_idx if interval == 'step' else self.current_epoch + # Take step if call to update_learning_rates matches the interval key and + # the current step modulo the schedulers frequency is zero + if lr_scheduler['interval'] == interval and current_idx % lr_scheduler['frequency'] == 0: + # If instance of ReduceLROnPlateau, we need to pass validation loss + if lr_scheduler['reduce_on_plateau']: + val_loss = self.callback_metrics.get('val_loss') + if val_loss is None: + avail_metrics = ','.join(list(self.callback_metrics.keys())) + m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ + f'which is not available. 
Available metrics are: {avail_metrics}' + raise MisconfigurationException(m) + lr_scheduler['scheduler'].step(val_loss) + else: + lr_scheduler['scheduler'].step() From 2c848b9a99f8ccdb5c1e1801cf60b6cededa2fcc Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Fri, 28 Feb 2020 10:36:32 +0100 Subject: [PATCH 18/28] update to tests --- tests/trainer/test_optimizers.py | 150 +++++++++++++++++++++++++++++++ tests/trainer/test_trainer.py | 135 +--------------------------- 2 files changed, 151 insertions(+), 134 deletions(-) create mode 100644 tests/trainer/test_optimizers.py diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py new file mode 100644 index 0000000000000..495a3460063d6 --- /dev/null +++ b/tests/trainer/test_optimizers.py @@ -0,0 +1,150 @@ +import math +import os + +import pytest +import torch + +import tests.models.utils as tutils +from pytorch_lightning import Trainer + +from tests.models import ( + TestModelBase, + LightTrainDataloader, + LightTestOptimizerWithSchedulingMixin, + LightTestMultipleOptimizersWithSchedulingMixin, + LightTestOptimizersWithMixedSchedulingMixin +) + + +def test_optimizer_with_scheduling(tmpdir): + """ Verify that learning rate scheduling is working """ + tutils.reset_seed() + + class CurrentTestModel( + LightTestOptimizerWithSchedulingMixin, + LightTrainDataloader, + TestModelBase): + pass + + hparams = tutils.get_hparams() + model = CurrentTestModel(hparams) + + # logger file to get meta + trainer_options = dict( + default_save_path=tmpdir, + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.2 + ) + + # fit model + trainer = Trainer(**trainer_options) + results = trainer.fit(model) + + init_lr = hparams.learning_rate + adjusted_lr = [pg['lr'] for pg in trainer.optimizers[0].param_groups] + + assert len(trainer.lr_schedulers) == 1, \ + 'lr scheduler not initialized properly' + + assert all(a == adjusted_lr[0] for a in adjusted_lr), \ + 'lr not equally adjusted for all param groups' + adjusted_lr = adjusted_lr[0] + + assert init_lr * 0.1 == adjusted_lr, \ + 'lr not adjusted correctly' + + +def test_multi_optimizer_with_scheduling(tmpdir): + """ Verify that learning rate scheduling is working """ + tutils.reset_seed() + + class CurrentTestModel( + LightTestMultipleOptimizersWithSchedulingMixin, + LightTrainDataloader, + TestModelBase): + pass + + hparams = tutils.get_hparams() + model = CurrentTestModel(hparams) + + # logger file to get meta + trainer_options = dict( + default_save_path=tmpdir, + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.2 + ) + + # fit model + trainer = Trainer(**trainer_options) + results = trainer.fit(model) + + init_lr = hparams.learning_rate + adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups] + adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups] + + assert len(trainer.lr_schedulers) == 2, \ + 'all lr scheduler not initialized properly' + + assert all(a == adjusted_lr1[0] for a in adjusted_lr1), \ + 'lr not equally adjusted for all param groups for optimizer 1' + adjusted_lr1 = adjusted_lr1[0] + + assert all(a == adjusted_lr2[0] for a in adjusted_lr2), \ + 'lr not equally adjusted for all param groups for optimizer 2' + adjusted_lr2 = adjusted_lr2[0] + + assert init_lr * 0.1 == adjusted_lr1 and init_lr * 0.1 == adjusted_lr2, \ + 'lr not adjusted correctly' + + +def test_multi_optimizer_with_scheduling_stepping(tmpdir): + tutils.reset_seed() + + class CurrentTestModel( + LightTestOptimizersWithMixedSchedulingMixin, + 
LightTrainDataloader, + TestModelBase): + pass + + hparams = tutils.get_hparams() + model = CurrentTestModel(hparams) + + # logger file to get meta + trainer_options = dict( + default_save_path=tmpdir, + max_epochs=1, + val_percent_check=0.1, + train_percent_check=0.2 + ) + + # fit model + trainer = Trainer(**trainer_options) + results = trainer.fit(model) + + init_lr = hparams.learning_rate + adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups] + adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups] + + assert len(trainer.lr_schedulers) == 2, \ + 'all lr scheduler not initialized properly' + + assert all(a == adjusted_lr1[0] for a in adjusted_lr1), \ + 'lr not equally adjusted for all param groups for optimizer 1' + adjusted_lr1 = adjusted_lr1[0] + + assert all(a == adjusted_lr2[0] for a in adjusted_lr2), \ + 'lr not equally adjusted for all param groups for optimizer 2' + adjusted_lr2 = adjusted_lr2[0] + + # Called ones after end of epoch + assert init_lr * (0.1)**3 == adjusted_lr1, \ + 'lr for optimizer 1 not adjusted correctly' + # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times + assert init_lr * 0.1 == adjusted_lr2, \ + 'lr for optimizer 2 not adjusted correctly' + + +# if __name__ == '__main__': +# pytest.main([__file__]) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 05a6b4e3c2d60..4d683b96bdd9a 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -19,10 +19,7 @@ LightTrainDataloader, LightTestDataloader, LightValidationMixin, - LightTestMixin, - LightTestOptimizerWithSchedulingMixin, - LightTestMultipleOptimizersWithSchedulingMixin, - LightTestOptimizersWithMixedSchedulingMixin + LightTestMixin ) from pytorch_lightning.core.lightning import load_hparams_from_tags_csv from pytorch_lightning.trainer.logging import TrainerLoggingMixin @@ -717,135 +714,5 @@ def on_test_end(self, trainer, pl_module): assert test_callback.on_test_end_called -def test_optimizer_with_scheduling(tmpdir): - """ Verify that learning rate scheduling is working """ - tutils.reset_seed() - - class CurrentTestModel( - LightTestOptimizerWithSchedulingMixin, - LightTrainDataloader, - TestModelBase): - pass - - hparams = tutils.get_hparams() - model = CurrentTestModel(hparams) - - # logger file to get meta - trainer_options = dict( - default_save_path=tmpdir, - max_epochs=1, - val_percent_check=0.1, - train_percent_check=0.2 - ) - - # fit model - trainer = Trainer(**trainer_options) - results = trainer.fit(model) - - init_lr = hparams.learning_rate - adjusted_lr = [pg['lr'] for pg in trainer.optimizers[0].param_groups] - - assert len(trainer.lr_schedulers) == 1, \ - 'lr scheduler not initialized properly' - - assert all(a == adjusted_lr[0] for a in adjusted_lr), \ - 'lr not equally adjusted for all param groups' - adjusted_lr = adjusted_lr[0] - - assert init_lr * 0.1 == adjusted_lr, \ - 'lr not adjusted correctly' - - -def test_multi_optimizer_with_scheduling(tmpdir): - """ Verify that learning rate scheduling is working """ - tutils.reset_seed() - - class CurrentTestModel( - LightTestMultipleOptimizersWithSchedulingMixin, - LightTrainDataloader, - TestModelBase): - pass - - hparams = tutils.get_hparams() - model = CurrentTestModel(hparams) - - # logger file to get meta - trainer_options = dict( - default_save_path=tmpdir, - max_epochs=1, - val_percent_check=0.1, - train_percent_check=0.2 - ) - - # fit model - trainer = Trainer(**trainer_options) - results = trainer.fit(model) - - init_lr 
= hparams.learning_rate - adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups] - adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups] - - assert len(trainer.lr_schedulers) == 2, \ - 'all lr scheduler not initialized properly' - - assert all(a == adjusted_lr1[0] for a in adjusted_lr1), \ - 'lr not equally adjusted for all param groups for optimizer 1' - adjusted_lr1 = adjusted_lr1[0] - - assert all(a == adjusted_lr2[0] for a in adjusted_lr2), \ - 'lr not equally adjusted for all param groups for optimizer 2' - adjusted_lr2 = adjusted_lr2[0] - - assert init_lr * 0.1 == adjusted_lr1 and init_lr * 0.1 == adjusted_lr2, \ - 'lr not adjusted correctly' - - -def test_multi_optimizer_with_scheduling_stepping(tmpdir): - tutils.reset_seed() - - class CurrentTestModel( - LightTestOptimizersWithMixedSchedulingMixin, - LightTrainDataloader, - TestModelBase): - pass - - hparams = tutils.get_hparams() - model = CurrentTestModel(hparams) - - # logger file to get meta - trainer_options = dict( - default_save_path=tmpdir, - max_epochs=1, - val_percent_check=0.1, - train_percent_check=0.2 - ) - - # fit model - trainer = Trainer(**trainer_options) - results = trainer.fit(model) - - init_lr = hparams.learning_rate - adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups] - adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups] - - assert len(trainer.lr_schedulers) == 2, \ - 'all lr scheduler not initialized properly' - - assert all(a == adjusted_lr1[0] for a in adjusted_lr1), \ - 'lr not equally adjusted for all param groups for optimizer 1' - adjusted_lr1 = adjusted_lr1[0] - - assert all(a == adjusted_lr2[0] for a in adjusted_lr2), \ - 'lr not equally adjusted for all param groups for optimizer 2' - adjusted_lr2 = adjusted_lr2[0] - - # Called ones after end of epoch - assert init_lr * (0.1)**3 == adjusted_lr1, \ - 'lr for optimizer 1 not adjusted correctly' - # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times - assert init_lr * 0.1 == adjusted_lr2, \ - 'lr for optimizer 2 not adjusted correctly' - - # if __name__ == '__main__': # pytest.main([__file__]) From 1906239d6ce908b280b09b8be53e205f70e2fae5 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Fri, 28 Feb 2020 10:37:25 +0100 Subject: [PATCH 19/28] fix styling --- pytorch_lightning/trainer/training_loop.py | 4 ++-- tests/models/mixins.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index a5bda198904aa..f2feefd54bd51 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -746,10 +746,10 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): return output def update_learning_rates(self, interval): - ''' Update learning rates + ''' Update learning rates Args: interval (str): either 'epoch' or 'step'. 
- ''' + ''' if not self.lr_schedulers: return diff --git a/tests/models/mixins.py b/tests/models/mixins.py index e0a12ea947a3f..6c3d8f908bfa5 100644 --- a/tests/models/mixins.py +++ b/tests/models/mixins.py @@ -634,7 +634,7 @@ def configure_optimizers(self): lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) return [optimizer1, optimizer2], \ - [{'scheduler': lr_scheduler1, 'interval': 'batch'}, lr_scheduler2] + [{'scheduler': lr_scheduler1, 'interval': 'step'}, lr_scheduler2] def _get_output_metric(output, name): From fc0ae096dbe868d36356dddedf6db424c73f5696 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Fri, 28 Feb 2020 10:54:46 +0100 Subject: [PATCH 20/28] fixed some logic --- pytorch_lightning/trainer/training_loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index f2feefd54bd51..e7ec119dca91f 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -755,6 +755,7 @@ def update_learning_rates(self, interval): for lr_scheduler in self.lr_schedulers: current_idx = self.batch_idx if interval == 'step' else self.current_epoch + current_idx += 1 # account for both batch and epoch starts from 0 # Take step if call to update_learning_rates matches the interval key and # the current step modulo the schedulers frequency is zero if lr_scheduler['interval'] == interval and current_idx % lr_scheduler['frequency'] == 0: From 44207bc9d943cc9ad6ca1faffb4f581cb17c2431 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 28 Feb 2020 15:16:50 +0100 Subject: [PATCH 21/28] Update pytorch_lightning/core/lightning.py --- pytorch_lightning/core/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 1ee64175e948c..308c3c45adab9 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -756,7 +756,7 @@ def configure_optimizers(self): gen_opt = Adam(self.model_gen.parameters(), lr=0.01) dis_opt = Adam(self.model_disc.parameters(), lr=0.02) gen_sched = {'scheduler': ExponentialLR(gen_opt, 0.99), - 'interval': 'step'] # called after each training step + 'interval': 'step'} # called after each training step dis_sched = CosineAnnealing(discriminator_opt, T_max=10) # called after each epoch return [gen_opt, dis_opt], [gen_sched, dis_sched] From ec15729c1bece2749ea153148208431b4504e548 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 3 Mar 2020 11:23:10 +0100 Subject: [PATCH 22/28] duplicated test --- tests/trainer/test_trainer.py | 120 ---------------------------------- 1 file changed, 120 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 05da303048f17..4c16c921296d2 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -600,123 +600,3 @@ def test_end(self, outputs): model = LightningTestModel(hparams) Trainer().test(model) - - -def test_trainer_callback_system(tmpdir): - """Test the callback system.""" - - class CurrentTestModel( - LightTrainDataloader, - LightTestMixin, - LightValidationMixin, - TestModelBase, - ): - pass - - hparams = tutils.get_hparams() - model = CurrentTestModel(hparams) - - class TestCallback(Callback): - def __init__(self): - super().__init__() - self.on_init_start_called = False - self.on_init_end_called = False - self.on_fit_start_called = False - self.on_fit_end_called = False - self.on_epoch_start_called = 
False - self.on_epoch_end_called = False - self.on_batch_start_called = False - self.on_batch_end_called = False - self.on_train_start_called = False - self.on_train_end_called = False - self.on_validation_start_called = False - self.on_validation_end_called = False - self.on_test_start_called = False - self.on_test_end_called = False - - def on_init_start(self, trainer, pl_module): - self.on_init_start_called = True - - def on_init_end(self, trainer, pl_module): - self.on_init_end_called = True - - def on_fit_start(self, trainer, pl_module): - self.on_fit_start_called = True - - def on_fit_end(self, trainer, pl_module): - self.on_fit_end_called = True - - def on_epoch_start(self, trainer, pl_module): - self.on_epoch_start_called = True - - def on_epoch_end(self, trainer, pl_module): - self.on_epoch_end_called = True - - def on_batch_start(self, trainer, pl_module): - self.on_batch_start_called = True - - def on_batch_end(self, trainer, pl_module): - self.on_batch_end_called = True - - def on_train_start(self, trainer, pl_module): - self.on_train_start_called = True - - def on_train_end(self, trainer, pl_module): - self.on_train_end_called = True - - def on_validation_start(self, trainer, pl_module): - self.on_validation_start_called = True - - def on_validation_end(self, trainer, pl_module): - self.on_validation_end_called = True - - def on_test_start(self, trainer, pl_module): - self.on_test_start_called = True - - def on_test_end(self, trainer, pl_module): - self.on_test_end_called = True - - test_callback = TestCallback() - - trainer_options = {} - trainer_options['callbacks'] = [test_callback] - trainer_options['max_epochs'] = 1 - trainer_options['val_percent_check'] = 0.1 - trainer_options['train_percent_check'] = 0.2 - trainer_options['show_progress_bar'] = False - - assert not test_callback.on_init_start_called - assert not test_callback.on_init_end_called - - # fit model - trainer = Trainer(**trainer_options) - - assert trainer.callbacks[0] == test_callback - assert test_callback.on_init_start_called - assert test_callback.on_init_end_called - assert not test_callback.on_fit_start_called - assert not test_callback.on_fit_start_called - - trainer.fit(model) - - assert test_callback.on_fit_start_called - assert test_callback.on_fit_end_called - assert test_callback.on_epoch_start_called - assert test_callback.on_epoch_start_called - assert test_callback.on_batch_start_called - assert test_callback.on_batch_end_called - assert test_callback.on_train_start_called - assert test_callback.on_train_end_called - assert test_callback.on_validation_start_called - assert test_callback.on_validation_end_called - assert not test_callback.on_test_start_called - assert not test_callback.on_test_end_called - - trainer.test() - - assert test_callback.on_test_start_called - assert test_callback.on_test_end_called - - -# if __name__ == '__main__': -# pytest.main([__file__]) From 2e5e9ba0b6ec576674f6f46484b31dece6f9eba7 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 4 Mar 2020 08:37:23 +0100 Subject: [PATCH 23/28] fix test on amp --- tests/models/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/utils.py b/tests/models/utils.py index db517b3e28157..d71eb346819fa 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -82,7 +82,7 @@ def run_model_test(trainer_options, model, on_gpu=True): if trainer.use_ddp or trainer.use_ddp2: # on hpc this would work fine... 
but need to hack it for the purpose of the test trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers() + trainer.optimizers, trainer.lr_schedulers = trainer.init_optimizers(pretrained_model.configure_optimizers()) # test HPC loading / saving trainer.hpc_save(save_dir, logger) From 8dc4c31fdd483dec17498888ac63c56e094cee9e Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 4 Mar 2020 11:33:13 +0100 Subject: [PATCH 24/28] small update to tests --- tests/trainer/test_optimizers.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index 495a3460063d6..bc5dde5f75013 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -45,14 +45,14 @@ class CurrentTestModel( adjusted_lr = [pg['lr'] for pg in trainer.optimizers[0].param_groups] assert len(trainer.lr_schedulers) == 1, \ - 'lr scheduler not initialized properly' + 'lr scheduler not initialized properly, it has %i elements instread of 1' % len(trainer.lr_schedulers) assert all(a == adjusted_lr[0] for a in adjusted_lr), \ - 'lr not equally adjusted for all param groups' + 'Lr not equally adjusted for all param groups' adjusted_lr = adjusted_lr[0] assert init_lr * 0.1 == adjusted_lr, \ - 'lr not adjusted correctly' + 'Lr not adjusted correctly, expected %f but got %f' % (init_lr * 0.1, adjusted_lr) def test_multi_optimizer_with_scheduling(tmpdir): @@ -85,18 +85,18 @@ class CurrentTestModel( adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups] assert len(trainer.lr_schedulers) == 2, \ - 'all lr scheduler not initialized properly' + 'all lr scheduler not initialized properly, it has %i elements instread of 1' % len(trainer.lr_schedulers) assert all(a == adjusted_lr1[0] for a in adjusted_lr1), \ - 'lr not equally adjusted for all param groups for optimizer 1' + 'Lr not equally adjusted for all param groups for optimizer 1' adjusted_lr1 = adjusted_lr1[0] assert all(a == adjusted_lr2[0] for a in adjusted_lr2), \ - 'lr not equally adjusted for all param groups for optimizer 2' + 'Lr not equally adjusted for all param groups for optimizer 2' adjusted_lr2 = adjusted_lr2[0] assert init_lr * 0.1 == adjusted_lr1 and init_lr * 0.1 == adjusted_lr2, \ - 'lr not adjusted correctly' + 'Lr not adjusted correctly, expected %f but got %f' % (init_lr * 0.1, adjusted_lr1) def test_multi_optimizer_with_scheduling_stepping(tmpdir): @@ -144,7 +144,3 @@ class CurrentTestModel( # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times assert init_lr * 0.1 == adjusted_lr2, \ 'lr for optimizer 2 not adjusted correctly' - - -# if __name__ == '__main__': -# pytest.main([__file__]) From 284afe59f44b3d501aa2af0f13cfdf522fa465af Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 4 Mar 2020 11:46:30 +0100 Subject: [PATCH 25/28] added monitor key for ReduceLROnPlateau --- pytorch_lightning/trainer/trainer.py | 5 +++-- pytorch_lightning/trainer/training_loop.py | 12 +++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8ffc0d2d087c0..f6561b9c39a1e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1075,7 +1075,8 @@ def configure_schedulers(self, schedulers: list): lr_schedulers = [] default_config = {'interval': 'epoch', # default every epoch 'frequency': 1, # default every epoch/batch - 
'reduce_on_plateau': False} # most often not ReduceLROnPlateau scheduler + 'reduce_on_plateau': False, # most often not ReduceLROnPlateau scheduler + 'monitor': 'val_loss'} # default value to monitor for ReduceLROnPlateau for scheduler in schedulers: if isinstance(scheduler, dict): if 'scheduler' not in scheduler: @@ -1083,7 +1084,7 @@ def configure_schedulers(self, schedulers: list): ' with item being a lr scheduler') scheduler['reduce_on_plateau'] = \ isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau) - + lr_schedulers.append({**default_config, **scheduler}) elif isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index b355cd8fa7f3e..ced1a28c8189c 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -715,12 +715,14 @@ def update_learning_rates(self, interval): if lr_scheduler['interval'] == interval and current_idx % lr_scheduler['frequency'] == 0: # If instance of ReduceLROnPlateau, we need to pass validation loss if lr_scheduler['reduce_on_plateau']: - val_loss = self.callback_metrics.get('val_loss') - if val_loss is None: + monitor_key = lr_scheduler['monitor'] + monitor_val = self.callback_metrics.get(monitor_key) + if monitor_val is None: avail_metrics = ','.join(list(self.callback_metrics.keys())) - m = f'ReduceLROnPlateau conditioned on metric val_loss ' \ - f'which is not available. Available metrics are: {avail_metrics}' + m = f'ReduceLROnPlateau conditioned on metric {monitor_key} ' \ + f'which is not available. Available metrics are: {avail_metrics}. ' \ + 'Condition can be set using `monitor` key in lr scheduler dict' raise MisconfigurationException(m) - lr_scheduler['scheduler'].step(val_loss) + lr_scheduler['scheduler'].step(monitor_val) else: lr_scheduler['scheduler'].step() From bf4c2bbb9a64f1e20b72ddc64b607661f4aaa535 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 5 Mar 2020 00:01:18 -0500 Subject: [PATCH 26/28] Update trainer.py --- pytorch_lightning/trainer/trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index fd2d3f9164442..dae535306344b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1106,8 +1106,8 @@ def configure_schedulers(self, schedulers: list): lr_schedulers = [] default_config = {'interval': 'epoch', # default every epoch 'frequency': 1, # default every epoch/batch - 'reduce_on_plateau': False, # most often not ReduceLROnPlateau scheduler - 'monitor': 'val_loss'} # default value to monitor for ReduceLROnPlateau + 'reduce_on_plateau': False, # most often not ReduceLROnPlateau scheduler + 'monitor': 'val_loss'} # default value to monitor for ReduceLROnPlateau for scheduler in schedulers: if isinstance(scheduler, dict): if 'scheduler' not in scheduler: @@ -1115,7 +1115,7 @@ def configure_schedulers(self, schedulers: list): ' with item being a lr scheduler') scheduler['reduce_on_plateau'] = \ isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau) - + lr_schedulers.append({**default_config, **scheduler}) elif isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau): From 383ed9a27e2654e7e61e85f53d54541958b7f454 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 5 Mar 2020 00:03:11 -0500 Subject: [PATCH 27/28] Update training_loop.py --- pytorch_lightning/trainer/training_loop.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 545f851457bf7..fb76f3ac9b4c6 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -731,7 +731,7 @@ def update_learning_rates(self, interval): avail_metrics = ','.join(list(self.callback_metrics.keys())) m = f'ReduceLROnPlateau conditioned on metric {monitor_key} ' \ f'which is not available. Available metrics are: {avail_metrics}. ' \ - 'Condition can be set using `monitor` key in lr scheduler dict' + 'Condition can be set using `monitor` key in lr scheduler dict' raise MisconfigurationException(m) lr_scheduler['scheduler'].step(monitor_val) else: From 1f42822a21e948faff650722f38bc2761f5ac6e8 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 5 Mar 2020 09:26:49 +0100 Subject: [PATCH 28/28] fix test after introducing monitor keyword --- tests/test_gpu_models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_gpu_models.py b/tests/test_gpu_models.py index 08af57ccab85a..47cd69b521cc2 100644 --- a/tests/test_gpu_models.py +++ b/tests/test_gpu_models.py @@ -122,7 +122,8 @@ def test_optimizer_return_options(): assert len(optim) == 1 and len(lr_sched) == 1 assert optim[0] == opts[0][0] and \ lr_sched[0] == dict(scheduler=scheduler, interval='epoch', - frequency=1, reduce_on_plateau=False) + frequency=1, reduce_on_plateau=False, + monitor='val_loss') def test_cpu_slurm_save_load(tmpdir):
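
A minimal sketch of the configuration format the series above enables, assuming an
otherwise ordinary LightningModule: configure_optimizers() may return lr schedulers
either as plain scheduler objects or as dicts with 'scheduler', 'interval' ('epoch'
or 'step'), 'frequency' and 'monitor' keys, with defaults interval='epoch',
frequency=1, monitor='val_loss'. The model, loss and hyperparameter values below are
illustrative and the dataloaders are omitted; only the scheduler-dict keys and their
defaults are taken from the diffs.

from torch import nn, optim
import pytorch_lightning as pl


class SketchModel(pl.LightningModule):
    """Illustrative module; only configure_optimizers() matters here."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return {'loss': nn.functional.cross_entropy(self(x), y)}

    def configure_optimizers(self):
        opt = optim.Adam(self.parameters(), lr=0.02)
        # dict form: step the scheduler every 3 optimizer steps instead of
        # the default once per epoch
        sched = {'scheduler': optim.lr_scheduler.StepLR(opt, 1, gamma=0.1),
                 'interval': 'step',
                 'frequency': 3}
        return [opt], [sched]

A plain scheduler passed without a dict is wrapped with the defaults above, and a
bare ReduceLROnPlateau instance is stepped with the metric named by 'monitor',
which defaults to 'val_loss'.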