From 0482e82d63a299acbe3be3a2384b78341db219a8 Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 1 Nov 2016 17:08:33 -0400 Subject: [PATCH 1/2] fixed max_deriv_time unset issue for BLSTM --- egs/ami/s5/local/chain/run_blstm_ami_5.sh | 2 +- .../s5/local/chain/tuning/run_blstm_7b.sh | 2 +- .../s5/local/chain/tuning/run_blstm_asp_1.sh | 2 +- .../s5/local/chain/run_blstm_6h.sh | 2 +- egs/multi_en/s5/local/chain/run_blstm_6h.sh | 2 +- .../s5c/local/chain/tuning/run_blstm_6h.sh | 2 +- .../s5c/local/chain/tuning/run_blstm_6i.sh | 2 +- .../s5c/local/chain/tuning/run_lstm_6h.sh | 2 +- .../s5c/local/chain/tuning/run_lstm_6i.sh | 2 +- egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh | 2 +- egs/wsj/s5/steps/nnet3/chain/train.py | 36 ++++++++++++------- egs/wsj/s5/steps/nnet3/train_rnn.py | 13 ++++--- 12 files changed, 41 insertions(+), 28 deletions(-) diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh index d9437af7e0c..c15f9e7fd6f 100755 --- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh +++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh @@ -118,7 +118,6 @@ if [ $stage -le 17 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -129,6 +128,7 @@ if [ $stage -le 17 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 79d633b1ebd..71517fac737 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -176,7 +176,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -193,6 +192,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 5fa4ea565cd..62bc5d89b6c 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -173,7 +173,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -188,6 +187,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index b70da4e852a..6865a739e43 100644 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -117,7 +117,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 1.414 \ @@ -128,6 +127,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh index 51ca7db0495..81a0cd87463 100644 --- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh +++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh @@ -124,7 +124,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 1.414 \ @@ -135,6 +134,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh index 95f7aef2708..1aa2d12ecb9 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh @@ -144,7 +144,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -155,6 +154,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh index 26cdaed29d7..6e1712c5187 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh @@ -150,7 +150,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -161,6 +160,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh index fbced146199..4a29093c1c5 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh index c5548cbfa5c..85b4e3a1ff3 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh index 28c20c92ab0..afd5a3e6eb2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh @@ -155,7 +155,6 @@ if [ $stage -le 13 ]; then --chain.xent-regularize $xent_regularize \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 2.0 \ --trainer.num-epochs 4 \ @@ -165,6 +164,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 0 \ --egs.stage $get_egs_stage \ --egs.opts="--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index dce77d007ef..34bfc7376fc 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -118,11 +118,7 @@ def GetArgs(): " chain model's output") parser.add_argument("--chain.left-deriv-truncate", type=int, dest='left_deriv_truncate', - default = None, help="") - parser.add_argument("--chain.right-deriv-truncate", type=int, - dest='right_deriv_truncate', - default = None, help="") - + default = None, help="Deprecated. Kept for back compatibility") # trainer options parser.add_argument("--trainer.srand", type=int, dest='srand', @@ -224,6 +220,12 @@ def GetArgs(): parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', default=512, help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', + default = None, + help="If specified, it is the number of frames that the derivative will be backpropagated through the chunk boundaries, " + "e.g., During BLSTM model training if the chunk-width=150 and deriv-truncate-margin=5, then the derivative will be " + "backpropagated up to t=-5 and t=154 in the forward and backward LSTM sequence respectively; " + "otherwise, the derivative will be backpropagated to the end of the sequence.") # General options parser.add_argument("--stage", type=int, default=-4, @@ -284,6 +286,12 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") + if not args.left_deriv_truncate is None: + args.deriv_truncate_margin = -args.left_deriv_truncate + logger.warning("--chain.left-deriv-truncate (deprecated) is set by user, " + "and --trainer.deriv-truncate-margin is set to negative of that value={0}. " + "We recommend using the option --trainer.deriv-truncate-margin.".format(args.deriv_truncate_margin)) + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs directory which is the output of make_configs.py script""") @@ -330,7 +338,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi left_deriv_truncate, right_deriv_truncate, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, + shuffle_buffer_size, chunk_width, num_chunk_per_minibatch, frame_subsampling_factor, truncate_deriv_weights, cache_io_opts, run_opts): # We cannot easily use a single parallel SGE job to do the main training, @@ -340,10 +348,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi # but we use the same script for consistency with FF-DNN code deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + if not left_deriv_truncate is None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(-left_deriv_truncate) + if not right_deriv_truncate is None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(chunk_width - 1 + right_deriv_truncate) processes = [] for job in range(1,num_jobs+1): @@ -403,7 +411,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, + learning_rate, shrinkage_value, chunk_width, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, left_context, right_context, apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, @@ -488,6 +496,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, momentum = momentum, max_param_change = cur_max_param_change, shuffle_buffer_size = shuffle_buffer_size, + chunk_width = chunk_width, num_chunk_per_minibatch = cur_num_chunk_per_minibatch, frame_subsampling_factor = frame_subsampling_factor, truncate_deriv_weights = truncate_deriv_weights, @@ -686,14 +695,15 @@ def Train(args, run_opts): num_archives = num_archives, learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value = shrinkage_value, + chunk_width = args.chunk_width, num_chunk_per_minibatch = args.num_chunk_per_minibatch, num_hidden_layers = num_hidden_layers, add_layers_period = args.add_layers_period, left_context = left_context, right_context = right_context, apply_deriv_weights = args.apply_deriv_weights, - left_deriv_truncate = args.left_deriv_truncate, - right_deriv_truncate = args.right_deriv_truncate, + left_deriv_truncate = args.deriv_truncate_margin, + right_deriv_truncate = args.deriv_truncate_margin, l2_regularize = args.l2_regularize, xent_regularize = args.xent_regularize, leaky_hmm_coefficient = args.leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 0853aa81aff..558192bf026 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -346,7 +346,7 @@ def __init__(self): def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, + left_context, right_context, min_deriv_time, max_deriv_time, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, cache_read_opt, run_opts): @@ -375,7 +375,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ - --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + --optimization.min-deriv-time={min_deriv_time} --optimization.max-deriv-time={max_deriv_time} "{raw_model}" \ "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, @@ -384,7 +384,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi parallel_train_opts = run_opts.parallel_train_opts, cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, momentum = momentum, max_param_change = max_param_change, - min_deriv_time = min_deriv_time, + min_deriv_time = min_deriv_time, max_deriv_time = max_deriv_time, raw_model = raw_model_string, context_opts = context_opts, egs_dir = egs_dir, archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, @@ -409,7 +409,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, - left_context, right_context, min_deriv_time, + left_context, right_context, min_deriv_time, max_deriv_time, momentum, max_param_change, shuffle_buffer_size, cv_minibatch_size, run_opts): # Set off jobs doing some diagnostics, in the background. @@ -490,6 +490,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, left_context = left_context, right_context = right_context, min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, momentum = momentum, max_param_change = max_param_change, shuffle_buffer_size = shuffle_buffer_size, @@ -650,11 +651,12 @@ def Train(args, run_opts): cur_egs_dir=egs_dir if args.num_bptt_steps is None: - num_bptt_steps = args.chunk_width + num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, args.chunk_right_context) else: num_bptt_steps = args.num_bptt_steps min_deriv_time = args.chunk_width - num_bptt_steps + max_deriv_time = num_bptt_steps - 1 logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) @@ -695,6 +697,7 @@ def Train(args, run_opts): left_context = left_context, right_context = right_context, min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, momentum = args.momentum, max_param_change= args.max_param_change, shuffle_buffer_size = args.shuffle_buffer_size, From 52fabe5a004280e1a47a8766f715c9ca90a123da Mon Sep 17 00:00:00 2001 From: freewym Date: Wed, 16 Nov 2016 23:29:57 -0500 Subject: [PATCH 2/2] change {left|right}_deriv_truncate to {min|max}_deriv_time in argument lists --- egs/ami/s5/local/chain/run_blstm_ami_5.sh | 2 +- .../s5/local/chain/tuning/run_blstm_7b.sh | 2 +- .../s5/local/chain/tuning/run_blstm_asp_1.sh | 2 +- .../s5/local/chain/run_blstm_6h.sh | 2 +- egs/multi_en/s5/local/chain/run_blstm_6h.sh | 2 +- .../s5c/local/chain/tuning/run_blstm_6h.sh | 2 +- .../s5c/local/chain/tuning/run_lstm_6h.sh | 2 +- .../s5c/local/chain/tuning/run_lstm_6i.sh | 2 +- egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh | 2 +- egs/wsj/s5/steps/nnet3/chain/train.py | 32 +++++++++++-------- egs/wsj/s5/steps/nnet3/train_rnn.py | 3 +- 11 files changed, 29 insertions(+), 24 deletions(-) diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh index c15f9e7fd6f..5943494d8e1 100755 --- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh +++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 71517fac737..522498d847d 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -192,7 +192,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 62bc5d89b6c..c11420e5cfd 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -187,7 +187,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index 6865a739e43..a48e7ed55af 100644 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -127,7 +127,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh index 81a0cd87463..5a68947282a 100644 --- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh +++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh @@ -134,7 +134,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh index 1aa2d12ecb9..9ab72b40ac2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh @@ -154,7 +154,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh index 4a29093c1c5..3155e21b618 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh @@ -159,7 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh index 85b4e3a1ff3..f1a42cc175c 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh @@ -159,7 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh index afd5a3e6eb2..a678fe22044 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh @@ -164,7 +164,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ - --trainer.deriv-truncate-margin 0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts="--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 34bfc7376fc..5f3a25038d1 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -335,10 +335,10 @@ def __init__(self): def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, left_context, right_context, apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, + min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, - shuffle_buffer_size, chunk_width, num_chunk_per_minibatch, + shuffle_buffer_size, num_chunk_per_minibatch, frame_subsampling_factor, truncate_deriv_weights, cache_io_opts, run_opts): # We cannot easily use a single parallel SGE job to do the main training, @@ -348,10 +348,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi # but we use the same script for consistency with FF-DNN code deriv_time_opts="" - if not left_deriv_truncate is None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(-left_deriv_truncate) - if not right_deriv_truncate is None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(chunk_width - 1 + right_deriv_truncate) + if not min_deriv_time is None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(min_deriv_time) + if not max_deriv_time is None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(max_deriv_time) processes = [] for job in range(1,num_jobs+1): @@ -411,10 +411,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, chunk_width, num_chunk_per_minibatch, + learning_rate, shrinkage_value, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, left_context, right_context, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + apply_deriv_weights, min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, truncate_deriv_weights, @@ -488,15 +488,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, left_context = left_context, right_context = right_context, apply_deriv_weights = apply_deriv_weights, - left_deriv_truncate = left_deriv_truncate, - right_deriv_truncate = right_deriv_truncate, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, l2_regularize = l2_regularize, xent_regularize = xent_regularize, leaky_hmm_coefficient = leaky_hmm_coefficient, momentum = momentum, max_param_change = cur_max_param_change, shuffle_buffer_size = shuffle_buffer_size, - chunk_width = chunk_width, num_chunk_per_minibatch = cur_num_chunk_per_minibatch, frame_subsampling_factor = frame_subsampling_factor, truncate_deriv_weights = truncate_deriv_weights, @@ -671,6 +670,12 @@ def Train(args, run_opts): args.initial_effective_lrate, args.final_effective_lrate) + min_deriv_time = None + max_deriv_time = None + if not args.deriv_truncate_margin is None: + min_deriv_time = -args.deriv_truncate_margin + max_deriv_time = args.chunk_width - 1 + args.deriv_truncate_margin + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): if (args.exit_stage is not None) and (iter == args.exit_stage): @@ -695,15 +700,14 @@ def Train(args, run_opts): num_archives = num_archives, learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value = shrinkage_value, - chunk_width = args.chunk_width, num_chunk_per_minibatch = args.num_chunk_per_minibatch, num_hidden_layers = num_hidden_layers, add_layers_period = args.add_layers_period, left_context = left_context, right_context = right_context, apply_deriv_weights = args.apply_deriv_weights, - left_deriv_truncate = args.deriv_truncate_margin, - right_deriv_truncate = args.deriv_truncate_margin, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, l2_regularize = args.l2_regularize, xent_regularize = args.xent_regularize, leaky_hmm_coefficient = args.leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 558192bf026..89db4276cfc 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -194,7 +194,7 @@ def GetArgs(): help="Number of sequences to be processed in parallel every minibatch" ) parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', default=None, - help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + help="The number of time steps to back-propagate from the last label in the chunk. By default it is set to (chunk-width + 10)." ) # General options parser.add_argument("--stage", type=int, default=-4, @@ -651,6 +651,7 @@ def Train(args, run_opts): cur_egs_dir=egs_dir if args.num_bptt_steps is None: + # num_bptt_steps is set to (chunk_width + 10) by default num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, args.chunk_right_context) else: num_bptt_steps = args.num_bptt_steps