diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh
index d9437af7e0c..5943494d8e1 100755
--- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh
+++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh
@@ -118,7 +118,6 @@ if [ $stage -le 17 ]; then
     --chain.l2-regularize 0.00005 \
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
     --trainer.num-chunk-per-minibatch 128 \
     --trainer.frames-per-iter 1200000 \
     --trainer.max-param-change 2.0 \
@@ -129,6 +128,7 @@ if [ $stage -le 17 ]; then
     --trainer.optimization.initial-effective-lrate 0.001 \
     --trainer.optimization.final-effective-lrate 0.0001 \
     --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
     --egs.stage $get_egs_stage \
     --egs.opts "--frames-overlap-per-eg 0" \
     --egs.chunk-width $chunk_width \
diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
index 79d633b1ebd..522498d847d 100755
--- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
@@ -176,7 +176,6 @@ if [ $stage -le 12 ]; then
     --chain.l2-regularize 0.00005 \
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
     --trainer.num-chunk-per-minibatch 64 \
     --trainer.max-param-change 1.414 \
     --egs.stage $get_egs_stage \
@@ -193,6 +192,7 @@ if [ $stage -le 12 ]; then
     --trainer.optimization.final-effective-lrate 0.0001 \
     --trainer.optimization.shrink-value 0.99 \
     --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
     --cleanup.remove-egs $remove_egs \
     --feat-dir data/train_rvb_min${min_seg_len}_hires \
     --tree-dir $treedir \
diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
index 5fa4ea565cd..c11420e5cfd 100755
--- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
+++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
@@ -173,7 +173,6 @@ if [ $stage -le 12 ]; then
     --chain.l2-regularize 0.00005 \
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
     --trainer.num-chunk-per-minibatch 64 \
     --trainer.max-param-change 1.414 \
     --egs.stage $get_egs_stage \
@@ -188,6 +187,7 @@ if [ $stage -le 12 ]; then
     --trainer.optimization.final-effective-lrate 0.0001 \
     --trainer.optimization.shrink-value 0.99 \
     --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
     --cleanup.remove-egs $remove_egs \
     --feat-dir data/train_rvb_min${min_seg_len}_hires \
     --tree-dir $treedir \
diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh
index b70da4e852a..a48e7ed55af 100644
--- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh
+++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh
@@ -117,7 +117,6 @@ if [ $stage -le 13 ]; then
     --chain.l2-regularize 0.00005 \
     --chain.apply-deriv-weights false \
     --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
     --trainer.num-chunk-per-minibatch 64 \
     --trainer.frames-per-iter 1200000 \
     --trainer.max-param-change 1.414 \
@@ -128,6 +127,7 @@ if [ $stage -le 13 ]; then
     --trainer.optimization.initial-effective-lrate 0.001 \
     --trainer.optimization.final-effective-lrate 0.0001 \
     --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
     --egs.stage $get_egs_stage \
     --egs.opts "--frames-overlap-per-eg 0" \
     --egs.chunk-width $chunk_width \
diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh
index 51ca7db0495..5a68947282a 100644
--- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh
+++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh
@@ -124,7 +124,6 @@ if [ $stage -le 13 ]; then
    --chain.l2-regularize 0.00005 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
    --trainer.num-chunk-per-minibatch 64 \
    --trainer.frames-per-iter 1200000 \
    --trainer.max-param-change 1.414 \
@@ -135,6 +134,7 @@ if [ $stage -le 13 ]; then
    --trainer.optimization.initial-effective-lrate 0.001 \
    --trainer.optimization.final-effective-lrate 0.0001 \
    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0" \
    --egs.chunk-width $chunk_width \
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh
index 95f7aef2708..9ab72b40ac2 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh
@@ -144,7 +144,6 @@ if [ $stage -le 13 ]; then
    --chain.l2-regularize 0.00005 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
    --trainer.num-chunk-per-minibatch 64 \
    --trainer.frames-per-iter 1200000 \
    --trainer.max-param-change 2.0 \
@@ -155,6 +154,7 @@ if [ $stage -le 13 ]; then
    --trainer.optimization.initial-effective-lrate 0.001 \
    --trainer.optimization.final-effective-lrate 0.0001 \
    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0" \
    --egs.chunk-width $chunk_width \
diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh
index 26cdaed29d7..6e1712c5187 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh
@@ -150,7 +150,6 @@ if [ $stage -le 13 ]; then
    --chain.l2-regularize 0.00005 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
    --trainer.num-chunk-per-minibatch 64 \
    --trainer.frames-per-iter 1200000 \
    --trainer.max-param-change 2.0 \
@@ -161,6 +160,7 @@ if [ $stage -le 13 ]; then
    --trainer.optimization.initial-effective-lrate 0.001 \
    --trainer.optimization.final-effective-lrate 0.0001 \
    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0" \
    --egs.chunk-width $chunk_width \
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh
index fbced146199..3155e21b618 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh
@@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then
    --chain.l2-regularize 0.00005 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
    --trainer.num-chunk-per-minibatch 64 \
    --trainer.frames-per-iter 1200000 \
    --trainer.max-param-change 2.0 \
@@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then
    --trainer.optimization.initial-effective-lrate 0.001 \
    --trainer.optimization.final-effective-lrate 0.0001 \
    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0" \
    --egs.chunk-width $chunk_width \
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh
index c5548cbfa5c..f1a42cc175c 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh
@@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then
    --chain.l2-regularize 0.00005 \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
    --trainer.num-chunk-per-minibatch 64 \
    --trainer.frames-per-iter 1200000 \
    --trainer.max-param-change 2.0 \
@@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then
    --trainer.optimization.initial-effective-lrate 0.001 \
    --trainer.optimization.final-effective-lrate 0.0001 \
    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
    --egs.stage $get_egs_stage \
    --egs.opts "--frames-overlap-per-eg 0" \
    --egs.chunk-width $chunk_width \
diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh
index 28c20c92ab0..a678fe22044 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh
@@ -155,7 +155,6 @@ if [ $stage -le 13 ]; then
    --chain.xent-regularize $xent_regularize \
    --chain.apply-deriv-weights false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
-    --chain.left-deriv-truncate 0 \
    --trainer.num-chunk-per-minibatch 64 \
    --trainer.max-param-change 2.0 \
    --trainer.num-epochs 4 \
@@ -165,6 +164,7 @@ if [ $stage -le 13 ]; then
    --trainer.optimization.initial-effective-lrate 0.001 \
    --trainer.optimization.final-effective-lrate 0.0001 \
    --trainer.optimization.momentum 0.0 \
+    --trainer.deriv-truncate-margin 10 \
    --egs.stage $get_egs_stage \
    --egs.opts="--frames-overlap-per-eg 0" \
    --egs.chunk-width $chunk_width \
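For concreteness: the --trainer.deriv-truncate-margin 10 option used in all of the recipes above limits how far derivatives are back-propagated past the chunk boundaries. Below is a hypothetical standalone Python sketch (not part of the patch) of the arithmetic that Train() in steps/nnet3/chain/train.py performs further down in this diff; the chunk width of 150 is an assumed example value.

    # Sketch: the derivative window implied by --trainer.deriv-truncate-margin,
    # mirroring Train() in steps/nnet3/chain/train.py.  A chunk covers frames
    # t = 0 .. chunk_width - 1; (None, None) means the derivative is
    # back-propagated to the ends of the sequence.
    def deriv_window(chunk_width, deriv_truncate_margin=None):
        if deriv_truncate_margin is None:
            return (None, None)
        return (-deriv_truncate_margin, chunk_width - 1 + deriv_truncate_margin)

    # The recipes above pass a margin of 10; with an assumed chunk width of 150
    # the derivative reaches 10 frames past each chunk boundary:
    assert deriv_window(150, 10) == (-10, 159)
    # The example from the new option's help text: margin 5 gives t=-5 .. t=154.
    assert deriv_window(150, 5) == (-5, 154)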
diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py
index dce77d007ef..5f3a25038d1 100755
--- a/egs/wsj/s5/steps/nnet3/chain/train.py
+++ b/egs/wsj/s5/steps/nnet3/chain/train.py
@@ -118,11 +118,7 @@ def GetArgs():
                         " chain model's output")
     parser.add_argument("--chain.left-deriv-truncate", type=int,
                         dest='left_deriv_truncate',
-                        default = None, help="")
-    parser.add_argument("--chain.right-deriv-truncate", type=int,
-                        dest='right_deriv_truncate',
-                        default = None, help="")
-
+                        default = None, help="Deprecated. Kept for backward compatibility.")

     # trainer options
     parser.add_argument("--trainer.srand", type=int, dest='srand',
@@ -224,6 +220,12 @@ def GetArgs():
     parser.add_argument("--trainer.num-chunk-per-minibatch", type=int,
                         dest='num_chunk_per_minibatch', default=512,
                         help="Number of sequences to be processed in parallel every minibatch" )
+    parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin',
+                        default = None,
+                        help="If specified, the number of frames beyond the chunk boundaries through which the derivative is back-propagated, "
+                        "e.g., during BLSTM training, if chunk-width=150 and deriv-truncate-margin=5, the derivative is back-propagated "
+                        "up to t=-5 and t=154 in the forward and backward LSTM sequences respectively; "
+                        "otherwise, the derivative is back-propagated to the ends of the sequence.")

     # General options
     parser.add_argument("--stage", type=int, default=-4,
@@ -284,6 +286,12 @@ def ProcessArgs(args):
     if args.chunk_right_context < 0:
         raise Exception("--egs.chunk-right-context should be non-negative")

+    if not args.left_deriv_truncate is None:
+        args.deriv_truncate_margin = -args.left_deriv_truncate
+        logger.warning("--chain.left-deriv-truncate (deprecated) is set by the user; "
+                       "--trainer.deriv-truncate-margin is being set to the negative of that value ({0}). "
+                       "We recommend using --trainer.deriv-truncate-margin directly.".format(args.deriv_truncate_margin))
+
     if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")):
         raise Exception("""This scripts expects {0} to exist and have a configs
                         directory which is the output of make_configs.py script""")
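A hypothetical standalone sketch (not part of the patch) of the backward-compatibility conversion that ProcessArgs performs above: a legacy --chain.left-deriv-truncate value is negated to obtain the new margin, so that min_deriv_time = -deriv_truncate_margin reproduces the old truncation.

    # Sketch of the deprecation mapping in ProcessArgs.
    def margin_from_legacy(left_deriv_truncate):
        return -left_deriv_truncate

    # The old recipes passed --chain.left-deriv-truncate 0: margin 0, i.e.
    # truncate the derivative exactly at the chunk boundary (t = 0).
    assert margin_from_legacy(0) == 0
    # A legacy truncation of 5 frames inside the chunk becomes margin -5,
    # i.e. min_deriv_time = -(-5) = 5, the same point as before.
    assert margin_from_legacy(5) == -5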
@@ -327,7 +335,7 @@ def __init__(self):

 def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives,
                    raw_model_string, egs_dir,
                    left_context, right_context, apply_deriv_weights,
-                   left_deriv_truncate, right_deriv_truncate,
+                   min_deriv_time, max_deriv_time,
                    l2_regularize, xent_regularize, leaky_hmm_coefficient,
                    momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch,
@@ -340,10 +348,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
     # but we use the same script for consistency with FF-DNN code

     deriv_time_opts=""
-    if left_deriv_truncate is not None:
-        deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate)
-    if right_deriv_truncate is not None:
-        deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate))
+    if not min_deriv_time is None:
+        deriv_time_opts += " --optimization.min-deriv-time={0}".format(min_deriv_time)
+    if not max_deriv_time is None:
+        deriv_time_opts += " --optimization.max-deriv-time={0}".format(max_deriv_time)

     processes = []
     for job in range(1,num_jobs+1):
@@ -406,7 +414,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
                       learning_rate, shrinkage_value, num_chunk_per_minibatch,
                       num_hidden_layers, add_layers_period, left_context, right_context,
-                      apply_deriv_weights, left_deriv_truncate, right_deriv_truncate,
+                      apply_deriv_weights, min_deriv_time, max_deriv_time,
                       l2_regularize, xent_regularize, leaky_hmm_coefficient,
                       momentum, max_param_change, shuffle_buffer_size,
                       frame_subsampling_factor, truncate_deriv_weights,
@@ -480,8 +488,8 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
                            left_context = left_context,
                            right_context = right_context,
                            apply_deriv_weights = apply_deriv_weights,
-                           left_deriv_truncate = left_deriv_truncate,
-                           right_deriv_truncate = right_deriv_truncate,
+                           min_deriv_time = min_deriv_time,
+                           max_deriv_time = max_deriv_time,
                            l2_regularize = l2_regularize,
                            xent_regularize = xent_regularize,
                            leaky_hmm_coefficient = leaky_hmm_coefficient,
@@ -662,6 +670,12 @@ def Train(args, run_opts):
                                                   args.initial_effective_lrate,
                                                   args.final_effective_lrate)

+    min_deriv_time = None
+    max_deriv_time = None
+    if not args.deriv_truncate_margin is None:
+        min_deriv_time = -args.deriv_truncate_margin
+        max_deriv_time = args.chunk_width - 1 + args.deriv_truncate_margin
+
     logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
     for iter in range(num_iters):
         if (args.exit_stage is not None) and (iter == args.exit_stage):
@@ -692,8 +706,8 @@ def Train(args, run_opts):
                               left_context = left_context,
                               right_context = right_context,
                               apply_deriv_weights = args.apply_deriv_weights,
-                              left_deriv_truncate = args.left_deriv_truncate,
-                              right_deriv_truncate = args.right_deriv_truncate,
+                              min_deriv_time = min_deriv_time,
+                              max_deriv_time = max_deriv_time,
                               l2_regularize = args.l2_regularize,
                               xent_regularize = args.xent_regularize,
                               leaky_hmm_coefficient = args.leaky_hmm_coefficient,
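Note that the rewritten deriv_time_opts block in TrainNewModels above also removes a latent bug: the old right_deriv_truncate branch formatted int(chunk-width-right_deriv_truncate), which would have raised a NameError had it ever run (chunk and width are undefined names; the hyphens are subtractions). A hypothetical standalone sketch of the new logic, with assumed example values:

    # Sketch: deriv_time_opts construction as in TrainNewModels above; a value
    # of None means the corresponding option is omitted and the training
    # binary applies no truncation on that side.
    def build_deriv_time_opts(min_deriv_time, max_deriv_time):
        opts = ""
        if min_deriv_time is not None:
            opts += " --optimization.min-deriv-time={0}".format(min_deriv_time)
        if max_deriv_time is not None:
            opts += " --optimization.max-deriv-time={0}".format(max_deriv_time)
        return opts

    # chunk_width=150 with deriv-truncate-margin=10, as in the recipes above:
    assert build_deriv_time_opts(-10, 159) == (
        " --optimization.min-deriv-time=-10 --optimization.max-deriv-time=159")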
diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py
index 0853aa81aff..89db4276cfc 100755
--- a/egs/wsj/s5/steps/nnet3/train_rnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_rnn.py
@@ -194,7 +194,7 @@ def GetArgs():
                         help="Number of sequences to be processed in parallel every minibatch" )
     parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps',
                         default=None,
-                        help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." )
+                        help="The number of time steps to back-propagate from the last label in the chunk. By default it is set to chunk-width plus a margin of up to 10 frames, limited by the chunk's left/right context." )

     # General options
     parser.add_argument("--stage", type=int, default=-4,
@@ -346,7 +346,7 @@ def __init__(self):

 def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives,
                    raw_model_string, egs_dir,
-                   left_context, right_context, min_deriv_time,
+                   left_context, right_context, min_deriv_time, max_deriv_time,
                    momentum, max_param_change,
                    shuffle_buffer_size, num_chunk_per_minibatch,
                    cache_read_opt, run_opts):
@@ -375,7 +375,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
                     nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \
                     --print-interval=10 --momentum={momentum} \
                     --max-param-change={max_param_change} \
-                    --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \
+                    --optimization.min-deriv-time={min_deriv_time} --optimization.max-deriv-time={max_deriv_time} "{raw_model}" \
                     "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \
                     {dir}/{next_iter}.{job}.raw
                 """.format(command = run_opts.command,
@@ -384,7 +384,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
                            parallel_train_opts = run_opts.parallel_train_opts,
                            cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt,
                            momentum = momentum, max_param_change = max_param_change,
-                           min_deriv_time = min_deriv_time,
+                           min_deriv_time = min_deriv_time, max_deriv_time = max_deriv_time,
                            raw_model = raw_model_string, context_opts = context_opts,
                            egs_dir = egs_dir, archive_index = archive_index,
                            shuffle_buffer_size = shuffle_buffer_size,
@@ -409,7 +409,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
                       num_jobs, num_archives_processed, num_archives,
                       learning_rate, shrinkage_value, num_chunk_per_minibatch,
                       num_hidden_layers, add_layers_period,
-                      left_context, right_context, min_deriv_time,
+                      left_context, right_context, min_deriv_time, max_deriv_time,
                       momentum, max_param_change, shuffle_buffer_size,
                       cv_minibatch_size, run_opts):
     # Set off jobs doing some diagnostics, in the background.
@@ -490,6 +490,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
                    left_context = left_context,
                    right_context = right_context,
                    min_deriv_time = min_deriv_time,
+                   max_deriv_time = max_deriv_time,
                    momentum = momentum,
                    max_param_change = max_param_change,
                    shuffle_buffer_size = shuffle_buffer_size,
@@ -650,11 +651,13 @@ def Train(args, run_opts):
         cur_egs_dir=egs_dir

     if args.num_bptt_steps is None:
-        num_bptt_steps = args.chunk_width
+        # by default, back-propagate up to 10 frames past the chunk, limited by the context
+        num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, args.chunk_right_context)
     else:
         num_bptt_steps = args.num_bptt_steps

     min_deriv_time = args.chunk_width - num_bptt_steps
+    max_deriv_time = num_bptt_steps - 1

     logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
@@ -695,6 +698,7 @@ def Train(args, run_opts):
                                           left_context = left_context,
                                           right_context = right_context,
                                           min_deriv_time = min_deriv_time,
+                                          max_deriv_time = max_deriv_time,
                                           momentum = args.momentum,
                                           max_param_change= args.max_param_change,
                                           shuffle_buffer_size = args.shuffle_buffer_size,
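A hypothetical standalone sketch (not part of the patch) of the default BPTT window now computed in Train() of steps/nnet3/train_rnn.py; the chunk widths and contexts below are assumed example values.

    # Sketch: num_bptt_steps and the deriv-time window, as in train_rnn.py.
    # The default margin is at most 10 frames and is further limited by the
    # left/right context baked into the egs.
    def bptt_deriv_window(chunk_width, chunk_left_context, chunk_right_context,
                          num_bptt_steps=None):
        if num_bptt_steps is None:
            num_bptt_steps = chunk_width + min(10, chunk_left_context,
                                               chunk_right_context)
        min_deriv_time = chunk_width - num_bptt_steps
        max_deriv_time = num_bptt_steps - 1
        return (min_deriv_time, max_deriv_time)

    # With 40 frames of context on each side, the default window extends 10
    # frames past both chunk boundaries:
    assert bptt_deriv_window(20, 40, 40) == (-10, 29)
    # With only 5 frames of context, the margin shrinks to 5:
    assert bptt_deriv_window(20, 5, 5) == (-5, 24)
    # With no extra context this reduces to the old default,
    # num_bptt_steps == chunk_width:
    assert bptt_deriv_window(20, 0, 0) == (0, 19)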