diff --git a/examples/asr/jasper.py b/examples/asr/jasper.py
index d58b87b73fa2..4e1dd4b3c4be 100644
--- a/examples/asr/jasper.py
+++ b/examples/asr/jasper.py
@@ -7,7 +7,7 @@
 from ruamel.yaml import YAML
 
 import nemo
-from nemo.utils.lr_policies import SquareAnnealing
+from nemo.utils.lr_policies import CosineAnnealing
 import nemo.utils.argparse as nm_argparse
 import nemo_asr
 from nemo_asr.helpers import monitor_asr_train_progress, \
@@ -30,9 +30,10 @@ def parse_args():
     )
 
     # Overwrite default args
-    parser.add_argument("--num_epochs", type=int, default=None, required=True,
-                        help="number of epochs to train. You should specify"
-                             "either num_epochs or max_steps")
+    parser.add_argument("--max_steps", type=int, default=None, required=False,
+                        help="max number of steps to train")
+    parser.add_argument("--num_epochs", type=int, default=None, required=False,
+                        help="number of epochs to train")
     parser.add_argument("--model_config", type=str, required=True,
                         help="model configuration file: model.yaml")
 
@@ -43,21 +44,30 @@ def parse_args():
     parser.add_argument("--warmup_steps", default=0, type=int)
 
     args = parser.parse_args()
-    if args.max_steps is not None:
-        raise ValueError("Jasper uses num_epochs instead of max_steps")
+    if args.max_steps is not None and args.num_epochs is not None:
+        raise ValueError("Either max_steps or num_epochs should be provided.")
 
     return args
 
 
-def construct_name(name, lr, batch_size, num_epochs, wd, optimizer,
+def construct_name(name, lr, batch_size, max_steps, num_epochs, wd, optimizer,
                    iter_per_step):
-    return ("{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-ips_{6}".format(
-        name, lr,
-        batch_size,
-        num_epochs,
-        wd,
-        optimizer,
-        iter_per_step))
+    if max_steps is not None:
+        return ("{0}-lr_{1}-bs_{2}-s_{3}-wd_{4}-opt_{5}-ips_{6}".format(
+            name, lr,
+            batch_size,
+            max_steps,
+            wd,
+            optimizer,
+            iter_per_step))
+    else:
+        return ("{0}-lr_{1}-bs_{2}-e_{3}-wd_{4}-opt_{5}-ips_{6}".format(
+            name, lr,
+            batch_size,
+            num_epochs,
+            wd,
+            optimizer,
+            iter_per_step))
 
 
 def create_all_dags(args, neural_factory):
@@ -241,6 +251,7 @@ def main():
         args.exp_name,
         args.lr,
         args.batch_size,
+        args.max_steps,
         args.num_epochs,
         args.weight_decay,
         args.optimizer,
@@ -275,11 +286,14 @@ def main():
     neural_factory.train(
         tensors_to_optimize=[train_loss],
         callbacks=callbacks,
-        lr_policy=SquareAnnealing(args.num_epochs * steps_per_epoch,
-                                  warmup_steps=args.warmup_steps),
+        lr_policy=CosineAnnealing(
+            args.max_steps if args.max_steps is not None else
+            args.num_epochs * steps_per_epoch,
+            warmup_steps=args.warmup_steps),
         optimizer=args.optimizer,
         optimization_params={
             "num_epochs": args.num_epochs,
+            "max_steps": args.max_steps,
             "lr": args.lr,
             "betas": (
                 args.beta1,
diff --git a/nemo/nemo/backends/pytorch/actions.py b/nemo/nemo/backends/pytorch/actions.py
index 537ae581ebef..7cf69e40f2ce 100644
--- a/nemo/nemo/backends/pytorch/actions.py
+++ b/nemo/nemo/backends/pytorch/actions.py
@@ -929,8 +929,10 @@ def train(self,
               stop_on_nan_loss=False):
         if not optimization_params:
             optimization_params = {}
-        num_epochs = optimization_params.get("num_epochs", 1)
+        num_epochs = optimization_params.get("num_epochs", None)
         max_steps = optimization_params.get("max_steps", None)
+        if num_epochs is None and max_steps is None:
+            raise ValueError("You must specify either max_steps or num_epochs")
         grad_norm_clip = optimization_params.get('grad_norm_clip', None)
 
         if batches_per_step is None:
@@ -1108,8 +1110,8 @@ def train(self,
 
         # MAIN TRAINING LOOP
         # iteration over epochs
-        for epoch_ind in range(self.epoch_num, num_epochs):
-            self.epoch_num = epoch_ind
+        self.epoch_num = 0
+        while num_epochs is None or self.epoch_num < num_epochs:
             if train_sampler is not None:
                 train_sampler.set_epoch(self.epoch_num)
             if max_steps is not None and self.step >= max_steps:
@@ -1230,9 +1232,9 @@ def train(self,
                 self._perform_on_iteration_end(callbacks=callbacks)
                 self.step += 1
             # End of epoch for loop
-
             # Register epochs end with callbacks
             self._perform_on_epoch_end(callbacks=callbacks)
+            self.epoch_num += 1
         self._perform_on_action_end(callbacks=callbacks)
 
     def infer(self,
diff --git a/nemo/nemo/core/callbacks.py b/nemo/nemo/core/callbacks.py
index ee08956c20e1..2a4e457e4355 100644
--- a/nemo/nemo/core/callbacks.py
+++ b/nemo/nemo/core/callbacks.py
@@ -2,7 +2,6 @@
 from abc import ABC, abstractmethod
 from collections import namedtuple
 import glob
-import math
 import os
 import sys
 import time
diff --git a/nemo/nemo/core/neural_factory.py b/nemo/nemo/core/neural_factory.py
index 0293e0034049..2bbdca196929 100644
--- a/nemo/nemo/core/neural_factory.py
+++ b/nemo/nemo/core/neural_factory.py
@@ -547,7 +547,8 @@ def eval(self,
         self.train(
             tensors_to_optimize=None,
             optimizer='sgd',
-            callbacks=callbacks
+            callbacks=callbacks,
+            optimization_params={'num_epochs': 1}
         )
 
     def infer(self, tensors: List[NmTensor], checkpoint_dir=None,
diff --git a/tests/test_neural_factory.py b/tests/test_neural_factory.py
index 394901492cc7..e6b0a7f13a3e 100644
--- a/tests/test_neural_factory.py
+++ b/tests/test_neural_factory.py
@@ -17,7 +17,6 @@ def test_creation(self):
             instance, nemo.backends.pytorch.tutorials.TaylorNet))
 
     def test_simple_example(self):
-        #######################################################################
         neural_factory = nemo.core.neural_factory.NeuralModuleFactory(
             backend=nemo.core.Backend.PyTorch,
             local_rank=None,
@@ -36,5 +35,5 @@ def test_simple_example(self):
 
         optimizer = neural_factory.get_trainer()
         optimizer.train([loss_tensor], optimizer="sgd",
-                        optimization_params={"lr": 1e-3})
-        #######################################################################
+                        optimization_params={"lr": 1e-3,
+                                             "num_epochs": 1})
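Below is a minimal usage sketch of the train() contract this patch introduces: optimization_params must now carry either "num_epochs" or "max_steps", and PtActions.train raises a ValueError if neither is given. The trainer calls mirror the updated tests/test_neural_factory.py; the toy data pipeline (RealFunctionDataLayer, TaylorNet, MSELoss from nemo.backends.pytorch.tutorials) and the max_steps value are illustrative assumptions, not part of this change.

import nemo

# Factory and trainer setup follows tests/test_neural_factory.py.
neural_factory = nemo.core.neural_factory.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    local_rank=None)

# Toy regression graph; these tutorial module names are assumptions
# based on the tutorials package referenced by the existing tests.
dl = nemo.backends.pytorch.tutorials.RealFunctionDataLayer(
    n=10000, batch_size=128)
fx = nemo.backends.pytorch.tutorials.TaylorNet(dim=4)
mse = nemo.backends.pytorch.tutorials.MSELoss()

x, y = dl()
p = fx(x=x)
loss_tensor = mse(predictions=p, target=y)

trainer = neural_factory.get_trainer()

# Bound training by epochs, as the updated test does...
trainer.train([loss_tensor], optimizer="sgd",
              optimization_params={"lr": 1e-3, "num_epochs": 1})

# ...or, alternatively, bound it by optimizer steps (illustrative value):
# trainer.train([loss_tensor], optimizer="sgd",
#               optimization_params={"lr": 1e-3, "max_steps": 100})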