Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion egs/ami/s5/local/chain/run_blstm_ami_5.sh
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ if [ $stage -le 17 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 128 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 2.0 \
Expand All @@ -129,6 +128,7 @@ if [ $stage -le 17 ]; then
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
Expand Down
2 changes: 1 addition & 1 deletion egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ if [ $stage -le 12 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.max-param-change 1.414 \
--egs.stage $get_egs_stage \
Expand All @@ -193,6 +192,7 @@ if [ $stage -le 12 ]; then
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.shrink-value 0.99 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--cleanup.remove-egs $remove_egs \
--feat-dir data/train_rvb_min${min_seg_len}_hires \
--tree-dir $treedir \
Expand Down
2 changes: 1 addition & 1 deletion egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ if [ $stage -le 12 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.max-param-change 1.414 \
--egs.stage $get_egs_stage \
Expand All @@ -188,6 +187,7 @@ if [ $stage -le 12 ]; then
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.shrink-value 0.99 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--cleanup.remove-egs $remove_egs \
--feat-dir data/train_rvb_min${min_seg_len}_hires \
--tree-dir $treedir \
Expand Down
2 changes: 1 addition & 1 deletion egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ if [ $stage -le 13 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 1.414 \
Expand All @@ -128,6 +127,7 @@ if [ $stage -le 13 ]; then
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
Expand Down
2 changes: 1 addition & 1 deletion egs/multi_en/s5/local/chain/run_blstm_6h.sh
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ if [ $stage -le 13 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 1.414 \
Expand All @@ -135,6 +134,7 @@ if [ $stage -le 13 ]; then
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
Expand Down
2 changes: 1 addition & 1 deletion egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,6 @@ if [ $stage -le 13 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 2.0 \
Expand All @@ -155,6 +154,7 @@ if [ $stage -le 13 ]; then
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
Expand Down
2 changes: 1 addition & 1 deletion egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ if [ $stage -le 13 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 2.0 \
Expand All @@ -161,6 +160,7 @@ if [ $stage -le 13 ]; then
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
Expand Down
2 changes: 1 addition & 1 deletion egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 2.0 \
Expand All @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
Expand Down
2 changes: 1 addition & 1 deletion egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.frames-per-iter 1200000 \
--trainer.max-param-change 2.0 \
Expand All @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
Expand Down
2 changes: 1 addition & 1 deletion egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ if [ $stage -le 13 ]; then
--chain.xent-regularize $xent_regularize \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--chain.left-deriv-truncate 0 \
--trainer.num-chunk-per-minibatch 64 \
--trainer.max-param-change 2.0 \
--trainer.num-epochs 4 \
Expand All @@ -165,6 +164,7 @@ if [ $stage -le 13 ]; then
--trainer.optimization.initial-effective-lrate 0.001 \
--trainer.optimization.final-effective-lrate 0.0001 \
--trainer.optimization.momentum 0.0 \
--trainer.deriv-truncate-margin 10 \
--egs.stage $get_egs_stage \
--egs.opts="--frames-overlap-per-eg 0" \
--egs.chunk-width $chunk_width \
Expand Down
44 changes: 29 additions & 15 deletions egs/wsj/s5/steps/nnet3/chain/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,7 @@ def GetArgs():
" chain model's output")
parser.add_argument("--chain.left-deriv-truncate", type=int,
dest='left_deriv_truncate',
default = None, help="")
parser.add_argument("--chain.right-deriv-truncate", type=int,
dest='right_deriv_truncate',
default = None, help="")

default = None, help="Deprecated. Kept for back compatibility")

# trainer options
parser.add_argument("--trainer.srand", type=int, dest='srand',
Expand Down Expand Up @@ -224,6 +220,12 @@ def GetArgs():
parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch',
default=512,
help="Number of sequences to be processed in parallel every minibatch" )
parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin',
default = None,
help="If specified, it is the number of frames that the derivative will be backpropagated through the chunk boundaries, "
"e.g., During BLSTM model training if the chunk-width=150 and deriv-truncate-margin=5, then the derivative will be "
"backpropagated up to t=-5 and t=154 in the forward and backward LSTM sequence respectively; "
"otherwise, the derivative will be backpropagated to the end of the sequence.")

# General options
parser.add_argument("--stage", type=int, default=-4,
Expand Down Expand Up @@ -284,6 +286,12 @@ def ProcessArgs(args):
if args.chunk_right_context < 0:
raise Exception("--egs.chunk-right-context should be non-negative")

if not args.left_deriv_truncate is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We recommend using the option --trainer.deriv-truncate-margin.

args.deriv_truncate_margin = -args.left_deriv_truncate
logger.warning("--chain.left-deriv-truncate (deprecated) is set by user, "
"and --trainer.deriv-truncate-margin is set to negative of that value={0}. "
"We recommend using the option --trainer.deriv-truncate-margin.".format(args.deriv_truncate_margin))

if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")):
raise Exception("""This scripts expects {0} to exist and have a configs
directory which is the output of make_configs.py script""")
Expand Down Expand Up @@ -327,7 +335,7 @@ def __init__(self):
def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives,
raw_model_string, egs_dir, left_context, right_context,
apply_deriv_weights,
left_deriv_truncate, right_deriv_truncate,
min_deriv_time, max_deriv_time,
l2_regularize, xent_regularize, leaky_hmm_coefficient,
momentum, max_param_change,
shuffle_buffer_size, num_chunk_per_minibatch,
Expand All @@ -340,10 +348,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
# but we use the same script for consistency with FF-DNN code

deriv_time_opts=""
if left_deriv_truncate is not None:
deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate)
if right_deriv_truncate is not None:
deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate))
if not min_deriv_time is None:
deriv_time_opts += " --optimization.min-deriv-time={0}".format(min_deriv_time)
if not max_deriv_time is None:
deriv_time_opts += " --optimization.max-deriv-time={0}".format(max_deriv_time)

processes = []
for job in range(1,num_jobs+1):
Expand Down Expand Up @@ -406,7 +414,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
learning_rate, shrinkage_value, num_chunk_per_minibatch,
num_hidden_layers, add_layers_period,
left_context, right_context,
apply_deriv_weights, left_deriv_truncate, right_deriv_truncate,
apply_deriv_weights, min_deriv_time, max_deriv_time,
l2_regularize, xent_regularize, leaky_hmm_coefficient,
momentum, max_param_change, shuffle_buffer_size,
frame_subsampling_factor, truncate_deriv_weights,
Expand Down Expand Up @@ -480,8 +488,8 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
left_context = left_context,
right_context = right_context,
apply_deriv_weights = apply_deriv_weights,
left_deriv_truncate = left_deriv_truncate,
right_deriv_truncate = right_deriv_truncate,
min_deriv_time = min_deriv_time,
max_deriv_time = max_deriv_time,
l2_regularize = l2_regularize,
xent_regularize = xent_regularize,
leaky_hmm_coefficient = leaky_hmm_coefficient,
Expand Down Expand Up @@ -662,6 +670,12 @@ def Train(args, run_opts):
args.initial_effective_lrate,
args.final_effective_lrate)

min_deriv_time = None
max_deriv_time = None
if not args.deriv_truncate_margin is None:
min_deriv_time = -args.deriv_truncate_margin
max_deriv_time = args.chunk_width - 1 + args.deriv_truncate_margin

logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
for iter in range(num_iters):
if (args.exit_stage is not None) and (iter == args.exit_stage):
Expand Down Expand Up @@ -692,8 +706,8 @@ def Train(args, run_opts):
left_context = left_context,
right_context = right_context,
apply_deriv_weights = args.apply_deriv_weights,
left_deriv_truncate = args.left_deriv_truncate,
right_deriv_truncate = args.right_deriv_truncate,
min_deriv_time = min_deriv_time,
max_deriv_time = max_deriv_time,
l2_regularize = args.l2_regularize,
xent_regularize = args.xent_regularize,
leaky_hmm_coefficient = args.leaky_hmm_coefficient,
Expand Down
16 changes: 10 additions & 6 deletions egs/wsj/s5/steps/nnet3/train_rnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def GetArgs():
help="Number of sequences to be processed in parallel every minibatch" )
parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps',
default=None,
help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." )
help="The number of time steps to back-propagate from the last label in the chunk. By default it is set to (chunk-width + 10)." )

# General options
parser.add_argument("--stage", type=int, default=-4,
Expand Down Expand Up @@ -346,7 +346,7 @@ def __init__(self):

def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives,
raw_model_string, egs_dir,
left_context, right_context, min_deriv_time,
left_context, right_context, min_deriv_time, max_deriv_time,
momentum, max_param_change,
shuffle_buffer_size, num_chunk_per_minibatch,
cache_read_opt, run_opts):
Expand Down Expand Up @@ -375,7 +375,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \
--print-interval=10 --momentum={momentum} \
--max-param-change={max_param_change} \
--optimization.min-deriv-time={min_deriv_time} "{raw_model}" \
--optimization.min-deriv-time={min_deriv_time} --optimization.max-deriv-time={max_deriv_time} "{raw_model}" \
"ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \
{dir}/{next_iter}.{job}.raw
""".format(command = run_opts.command,
Expand All @@ -384,7 +384,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
parallel_train_opts = run_opts.parallel_train_opts,
cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt,
momentum = momentum, max_param_change = max_param_change,
min_deriv_time = min_deriv_time,
min_deriv_time = min_deriv_time, max_deriv_time = max_deriv_time,
raw_model = raw_model_string, context_opts = context_opts,
egs_dir = egs_dir, archive_index = archive_index,
shuffle_buffer_size = shuffle_buffer_size,
Expand All @@ -409,7 +409,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
num_jobs, num_archives_processed, num_archives,
learning_rate, shrinkage_value, num_chunk_per_minibatch,
num_hidden_layers, add_layers_period,
left_context, right_context, min_deriv_time,
left_context, right_context, min_deriv_time, max_deriv_time,
momentum, max_param_change, shuffle_buffer_size,
cv_minibatch_size, run_opts):
# Set off jobs doing some diagnostics, in the background.
Expand Down Expand Up @@ -490,6 +490,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
left_context = left_context,
right_context = right_context,
min_deriv_time = min_deriv_time,
max_deriv_time = max_deriv_time,
momentum = momentum,
max_param_change = max_param_change,
shuffle_buffer_size = shuffle_buffer_size,
Expand Down Expand Up @@ -650,11 +651,13 @@ def Train(args, run_opts):
cur_egs_dir=egs_dir

if args.num_bptt_steps is None:
num_bptt_steps = args.chunk_width
# by default num_bptt_steps is chunk_width plus a margin of at most 10 frames; the margin is capped by the available chunk left/right context
num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, args.chunk_right_context)
else:
num_bptt_steps = args.num_bptt_steps

min_deriv_time = args.chunk_width - num_bptt_steps
max_deriv_time = num_bptt_steps - 1


logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
Expand Down Expand Up @@ -695,6 +698,7 @@ def Train(args, run_opts):
left_context = left_context,
right_context = right_context,
min_deriv_time = min_deriv_time,
max_deriv_time = max_deriv_time,
momentum = args.momentum,
max_param_change= args.max_param_change,
shuffle_buffer_size = args.shuffle_buffer_size,
Expand Down