From 0827ffc8c1ac84aa3d413db4a694050ab6bd8e80 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Thu, 17 Oct 2019 16:04:48 -0700 Subject: [PATCH 1/6] adjust jasper.py for experiments Signed-off-by: Oleksii Kuchaiev --- examples/asr/jasper.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/examples/asr/jasper.py b/examples/asr/jasper.py index d58b87b73fa2..6b22ece10464 100644 --- a/examples/asr/jasper.py +++ b/examples/asr/jasper.py @@ -7,7 +7,7 @@ from ruamel.yaml import YAML import nemo -from nemo.utils.lr_policies import SquareAnnealing +from nemo.utils.lr_policies import CosineAnnealing import nemo.utils.argparse as nm_argparse import nemo_asr from nemo_asr.helpers import monitor_asr_train_progress, \ @@ -30,9 +30,8 @@ def parse_args(): ) # Overwrite default args - parser.add_argument("--num_epochs", type=int, default=None, required=True, - help="number of epochs to train. You should specify" - "either num_epochs or max_steps") + parser.add_argument("--max_steps", type=int, required=True, default=None, + help="max number of steps to train") parser.add_argument("--model_config", type=str, required=True, help="model configuration file: model.yaml") @@ -43,18 +42,15 @@ def parse_args(): parser.add_argument("--warmup_steps", default=0, type=int) args = parser.parse_args() - if args.max_steps is not None: - raise ValueError("Jasper uses num_epochs instead of max_steps") - return args -def construct_name(name, lr, batch_size, num_epochs, wd, optimizer, +def construct_name(name, lr, batch_size, max_steps, wd, optimizer, iter_per_step): - return ("{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-ips_{6}".format( + return ("{0}-lr_{1}-bs_{2}-s_{3}-wd_{4}-opt_{5}-ips_{6}".format( name, lr, batch_size, - num_epochs, + max_steps, wd, optimizer, iter_per_step)) @@ -241,7 +237,7 @@ def main(): args.exp_name, args.lr, args.batch_size, - args.num_epochs, + args.max_steps, args.weight_decay, args.optimizer, args.iter_per_step) @@ -275,11 +271,11 @@ def main(): neural_factory.train( tensors_to_optimize=[train_loss], callbacks=callbacks, - lr_policy=SquareAnnealing(args.num_epochs * steps_per_epoch, + lr_policy=CosineAnnealing(args.max_steps, warmup_steps=args.warmup_steps), optimizer=args.optimizer, optimization_params={ - "num_epochs": args.num_epochs, + "max_steps": args.max_steps, "lr": args.lr, "betas": ( args.beta1, From 497c4adba8538b68e8df39c1280e784af8994697 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 18 Oct 2019 10:27:44 -0700 Subject: [PATCH 2/6] check bug when max_steps can't go above 1 epoch Signed-off-by: Oleksii Kuchaiev --- nemo/nemo/backends/pytorch/actions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/nemo/backends/pytorch/actions.py b/nemo/nemo/backends/pytorch/actions.py index e7cc421703c7..8e4601827018 100644 --- a/nemo/nemo/backends/pytorch/actions.py +++ b/nemo/nemo/backends/pytorch/actions.py @@ -905,7 +905,7 @@ def train(self, stop_on_nan_loss=False): if not optimization_params: optimization_params = {} - num_epochs = optimization_params.get("num_epochs", 1) + num_epochs = optimization_params.get("num_epochs", None) max_steps = optimization_params.get("max_steps", None) grad_norm_clip = optimization_params.get('grad_norm_clip', None) @@ -1084,8 +1084,8 @@ def train(self, # MAIN TRAINING LOOP # iteration over epochs - for epoch_ind in range(self.epoch_num, num_epochs): - self.epoch_num = epoch_ind + self.epoch_num = 0 + while num_epochs is None or self.epoch_num < num_epochs: if train_sampler is 
not None: train_sampler.set_epoch(self.epoch_num) if max_steps is not None and self.step >= max_steps: @@ -1200,9 +1200,9 @@ def train(self, self._perform_on_iteration_end(callbacks=callbacks) self.step += 1 # End of epoch for loop - # Register epochs end with callbacks self._perform_on_epoch_end(callbacks=callbacks) + self.epoch_num += 1 self._perform_on_action_end(callbacks=callbacks) def infer(self, From 93883517775203a271b25f4adfe9670989fb8da6 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 18 Oct 2019 10:38:27 -0700 Subject: [PATCH 3/6] adjust Jasper example script Signed-off-by: Oleksii Kuchaiev --- examples/asr/jasper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/asr/jasper.py b/examples/asr/jasper.py index 6b22ece10464..367350feae8a 100644 --- a/examples/asr/jasper.py +++ b/examples/asr/jasper.py @@ -30,8 +30,10 @@ def parse_args(): ) # Overwrite default args - parser.add_argument("--max_steps", type=int, required=True, default=None, + parser.add_argument("--max_steps", type=int, default=None, required=False, help="max number of steps to train") + parser.add_argument("--num_epochs", type=int, default=None, required=False, + help="number of epochs to train") parser.add_argument("--model_config", type=str, required=True, help="model configuration file: model.yaml") @@ -42,6 +44,9 @@ def parse_args(): parser.add_argument("--warmup_steps", default=0, type=int) args = parser.parse_args() + + if args.max_steps is not None and args.num_epochs is not None: + raise ValueError("Either max_steps or num_epochs should be provided.") return args @@ -275,6 +280,7 @@ def main(): warmup_steps=args.warmup_steps), optimizer=args.optimizer, optimization_params={ + "num_epochs": args.num_epochs, "max_steps": args.max_steps, "lr": args.lr, "betas": ( From ecaed8468c5945eff8a97a73ede8962947b6ad75 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 18 Oct 2019 11:04:07 -0700 Subject: [PATCH 4/6] fix steps_per epochs Signed-off-by: Oleksii Kuchaiev --- examples/asr/jasper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/asr/jasper.py b/examples/asr/jasper.py index 367350feae8a..9abc49a0b7a9 100644 --- a/examples/asr/jasper.py +++ b/examples/asr/jasper.py @@ -276,8 +276,10 @@ def main(): neural_factory.train( tensors_to_optimize=[train_loss], callbacks=callbacks, - lr_policy=CosineAnnealing(args.max_steps, - warmup_steps=args.warmup_steps), + lr_policy=CosineAnnealing( + args.max_steps if args.max_steps is not None else + args.num_epochs * steps_per_epoch, + warmup_steps=args.warmup_steps), optimizer=args.optimizer, optimization_params={ "num_epochs": args.num_epochs, From 5d1625201a0595741466aafdb63f44039de0d7bd Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 18 Oct 2019 16:01:02 -0700 Subject: [PATCH 5/6] bugfix Signed-off-by: Oleksii Kuchaiev --- nemo/nemo/backends/pytorch/actions.py | 2 ++ nemo/nemo/core/neural_factory.py | 3 ++- tests/test_neural_factory.py | 5 ++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/nemo/nemo/backends/pytorch/actions.py b/nemo/nemo/backends/pytorch/actions.py index 8e4601827018..7ad6ff3cb251 100644 --- a/nemo/nemo/backends/pytorch/actions.py +++ b/nemo/nemo/backends/pytorch/actions.py @@ -907,6 +907,8 @@ def train(self, optimization_params = {} num_epochs = optimization_params.get("num_epochs", None) max_steps = optimization_params.get("max_steps", None) + if num_epochs is None and max_steps is None: + raise ValueError("You must specify either max_steps or 
num_epochs") grad_norm_clip = optimization_params.get('grad_norm_clip', None) if batches_per_step is None: diff --git a/nemo/nemo/core/neural_factory.py b/nemo/nemo/core/neural_factory.py index db38aebc4a61..e316086736bb 100644 --- a/nemo/nemo/core/neural_factory.py +++ b/nemo/nemo/core/neural_factory.py @@ -537,7 +537,8 @@ def eval(self, self.train( tensors_to_optimize=None, optimizer='sgd', - callbacks=callbacks + callbacks=callbacks, + optimization_params={'num_epochs': 1} ) def infer(self, tensors: List[NmTensor], checkpoint_dir=None, diff --git a/tests/test_neural_factory.py b/tests/test_neural_factory.py index 394901492cc7..e6b0a7f13a3e 100644 --- a/tests/test_neural_factory.py +++ b/tests/test_neural_factory.py @@ -17,7 +17,6 @@ def test_creation(self): instance, nemo.backends.pytorch.tutorials.TaylorNet)) def test_simple_example(self): - ####################################################################### neural_factory = nemo.core.neural_factory.NeuralModuleFactory( backend=nemo.core.Backend.PyTorch, local_rank=None, @@ -36,5 +35,5 @@ def test_simple_example(self): optimizer = neural_factory.get_trainer() optimizer.train([loss_tensor], optimizer="sgd", - optimization_params={"lr": 1e-3}) - ####################################################################### + optimization_params={"lr": 1e-3, + "num_epochs": 1}) From c78aa6be778bbdd9dbeed508be10257151637fa6 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Mon, 21 Oct 2019 15:26:39 -0700 Subject: [PATCH 6/6] fix naming issue Signed-off-by: Oleksii Kuchaiev --- examples/asr/jasper.py | 26 ++++++++++++++++++-------- nemo/nemo/core/callbacks.py | 1 - 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/asr/jasper.py b/examples/asr/jasper.py index 9abc49a0b7a9..4e1dd4b3c4be 100644 --- a/examples/asr/jasper.py +++ b/examples/asr/jasper.py @@ -50,15 +50,24 @@ def parse_args(): return args -def construct_name(name, lr, batch_size, max_steps, wd, optimizer, +def construct_name(name, lr, batch_size, max_steps, num_epochs, wd, optimizer, iter_per_step): - return ("{0}-lr_{1}-bs_{2}-s_{3}-wd_{4}-opt_{5}-ips_{6}".format( - name, lr, - batch_size, - max_steps, - wd, - optimizer, - iter_per_step)) + if max_steps is not None: + return ("{0}-lr_{1}-bs_{2}-s_{3}-wd_{4}-opt_{5}-ips_{6}".format( + name, lr, + batch_size, + max_steps, + wd, + optimizer, + iter_per_step)) + else: + return ("{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-ips_{6}".format( + name, lr, + batch_size, + num_epochs, + wd, + optimizer, + iter_per_step)) def create_all_dags(args, neural_factory): @@ -243,6 +252,7 @@ def main(): args.lr, args.batch_size, args.max_steps, + args.num_epochs, args.weight_decay, args.optimizer, args.iter_per_step) diff --git a/nemo/nemo/core/callbacks.py b/nemo/nemo/core/callbacks.py index ee08956c20e1..2a4e457e4355 100644 --- a/nemo/nemo/core/callbacks.py +++ b/nemo/nemo/core/callbacks.py @@ -2,7 +2,6 @@ from abc import ABC, abstractmethod from collections import namedtuple import glob -import math import os import sys import time