From 13e8bed04d7c4b043badc2e8d17513c3e6b144a3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 20:41:35 -0500 Subject: [PATCH 01/21] [src,scripts,egs] nnet3,fast-lstm: changes to support separate per-frame dropout masks on i and f gates. Old dropout method not supported in this branch. --- .../local/chain/tuning/run_tdnn_lstm_1p.sh | 344 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1q.sh | 348 ++++++++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 50 +-- src/cudamatrix/cu-kernels-ansi.h | 8 +- src/cudamatrix/cu-kernels.cu | 66 ++-- src/cudamatrix/cu-kernels.h | 16 +- src/cudamatrix/cu-math-test.cc | 34 +- src/cudamatrix/cu-math.cc | 76 ++-- src/cudamatrix/cu-math.h | 53 +-- src/nnet3/nnet-component-itf.cc | 2 + src/nnet3/nnet-component-itf.h | 5 +- src/nnet3/nnet-general-component.cc | 100 +++++ src/nnet3/nnet-general-component.h | 89 +++++ src/nnet3/nnet-simple-component.cc | 36 +- src/nnet3/nnet-simple-component.h | 27 +- src/nnet3/nnet-utils.cc | 17 +- src/nnet3/nnet-utils.h | 2 +- 17 files changed, 1138 insertions(+), 135 deletions(-) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh new file mode 100755 index 00000000000..246601d8535 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh @@ -0,0 +1,344 @@ +#!/bin/bash + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1p #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh new file mode 100755 index 00000000000..f6a640fe17f --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh @@ -0,0 +1,348 @@ +#!/bin/bash + +# 1q is as 1p, but add the "dropout-exclusive" option which means that +# never drops out *both* the i and f gates. +# not helpful. see run_tdnn_lstm_1p.sh for results. + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. +# +# 1k is as 1e, but introducing a dropout schedule. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.0 +# [looped:] 9.0 8.6 8.9 8.9 +# WER on dev(rescored) 8.4 7.9 8.2 8.2 +# [looped:] 8.4 7.8 8.2 8.3 +# WER on test(orig) 8.8 8.8 8.9 8.9 +# [looped:] 8.8 8.7 8.8 8.8 +# WER on test(rescored) 8.4 8.3 8.2 8.5 +# [looped:] 8.3 8.3 8.3 8.4 +# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 +# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 +# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 +# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 + + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1q #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-exclusive=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9d7f649c4b4..f6d93808538 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -682,9 +682,12 @@ def set_default_configs(self): 'decay-time': -1.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added - 'dropout-per-frame' : False # If false, regular dropout, not per frame. - } + 'dropout-proportion' : -1.0, # If -1.0, no dropout will + # be used (note: this is + # per-frame dropout on the + # output of the i_t and f_t gates) + 'dropout-exclusive' : False # option affecting dropout masks. + } def set_derived_configs(self): if self.config['cell-dim'] <= 0: @@ -717,7 +720,6 @@ def check_configs(self): raise RuntimeError("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion'])) - def auxiliary_outputs(self): return ['c_t'] @@ -785,7 +787,7 @@ def generate_lstm_config(self): lstm_str = self.config['lstm-nonlinearity-options'] dropout_proportion = self.config['dropout-proportion'] - dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' + dropout_exclusive = 'true' if self.config['dropout-exclusive'] else 'false' configs = [] @@ -800,14 +802,16 @@ def generate_lstm_config(self): configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") - configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, lstm_str)) + configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} " + "use-dropout={2} {3}" + .format(name, cell_dim, "true" if dropout_proportion != -1.0 else "false", lstm_str)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) if dropout_proportion != -1.0: - configs.append("component name={0}.cr_trunc.dropout type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, cell_dim + rec_proj_dim, dropout_proportion, dropout_per_frame)) + configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=2 " + "dropout-proportion={1} exclusive={2}" + .format(name, dropout_proportion, dropout_exclusive)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent 
projections") configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " @@ -816,8 +820,17 @@ def generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) - configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + if dropout_proportion != -1.0: + # note: the 'input' is a don't-care as the component never uses it; it's required + # in component-node lines. + configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask " + "input={0}.dropout_mask".format(name)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})), {0}.dropout_mask)" + .format(name, delay)) + else: + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " @@ -831,17 +844,10 @@ def generate_lstm_config(self): configs.append("# makes the deriv truncation more accurate .") configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " "input=Append({0}.c, {0}.r)".format(name)) - if dropout_proportion != -1.0: - configs.append("component-node name={0}.cr_trunc.dropout component={0}.cr_trunc.dropout input={0}.cr_trunc".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc.dropout " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc.dropout " - "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) - else: - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " - "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " + "dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " + "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 116428ea82c..291d6a72cf3 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -650,14 +650,17 @@ void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, double* out); void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, float* out); void 
cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int in_stride, const double* params, const int params_stride, @@ -677,6 +680,7 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, double* self_repair_sum_out, const int self_repair_sum_out_stride); void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int in_stride, const float* params, const int params_stride, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index abb4efd47ef..f50e5853fdd 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -2722,6 +2722,9 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, consecutive blocks, each of dimension cell_dim, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + If 'have_dropout_mask' is nonzero, each row of + 'in' will have two extra elements, interpreted + as dropout masks/scales for i_t and f_t. @param [in] params A matrix, of dimension 3 by cell_dim, with rows containing the 3 diagonal parameter matrices used in LSTMs, namely @@ -2746,7 +2749,8 @@ __global__ static void _lstm_nonlinearity(const Real* in, const int in_stride, const Real* params, const int params_stride, const int out_stride, const int cell_dim, - const int num_rows, Real* out) { + const int have_dropout_mask, const int num_rows, + Real* out) { const int tid = threadIdx.x; const int i = blockIdx.x; const Real* i_part = in + i * in_stride; @@ -2759,12 +2763,14 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, const Real* w_oc = params + params_stride * 2; Real* c_t = out + i * out_stride; Real* m_t = out + i * out_stride + cell_dim; + Real i_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5] : 1), + f_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 1] : 1); for (int j = tid; j < cell_dim; j += CU1DBLOCK) { Real c_tm1_j = c_tm1[j]; Real i_t_j = Real(1) / (Real(1) + exp(-i_part[j] - w_ic[j] * c_tm1_j)); Real f_t_j = Real(1) / (Real(1) + exp(-f_part[j] - w_fc[j] * c_tm1_j)); - Real c_t_j = f_t_j * c_tm1_j + i_t_j * tanh(c_part[j]); + Real c_t_j = f_t_j * f_scale * c_tm1_j + i_t_j * i_scale * tanh(c_part[j]); Real o_t_j = Real(1) / (Real(1) + exp(-o_part[j] - w_oc[j] * c_t_j)); c_t[j] = c_t_j; m_t[j] = o_t_j * tanh(c_t_j); @@ -2792,6 +2798,9 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + If 'have_dropout_mask' is nonzero, each row of + 'in' will have two extra elements, interpreted + as dropout masks/scales for i_t and f_t. @param [in] params The same as in ComputeLstmNonlinearity(). 
A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -2864,7 +2873,8 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, */ template __global__ -static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, +static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_mask, + const int num_rows, const Real* input, const int input_stride, const Real* params, const int params_stride, const Real* output_deriv, @@ -2918,6 +2928,7 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real o_t_self_repair = (update_sr[3] ? sr_config[8] : 0); const Real c_t_self_repair = (update_sr[4] ? sr_config[9] : 0); + for (int i = i0; i < num_rows; i += grid_stride) { const Real i_part = input[i * input_stride + j]; const Real f_part = input[i * input_stride + j + cell_dim]; @@ -2925,10 +2936,15 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real o_part = input[i * input_stride + j + 3 * cell_dim]; const Real c_prev = input[i * input_stride + j + 4 * cell_dim]; - const Real i_t = 1 / (1 + exp(-i_part - w_ic * c_prev)); - const Real f_t = 1 / (1 + exp(-f_part - w_fc * c_prev)); + + const Real i_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5] : 1), + f_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5 + 1] :1); + const Real i_t = Real(1) / (1 + exp(-i_part - w_ic * c_prev)); + const Real f_t = Real(1) / (1 + exp(-f_part - w_fc * c_prev)); const Real tanh_c_part = tanh(c_part); - const Real c_t = f_t * c_prev + i_t * tanh_c_part; + const Real c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part; const Real o_t = 1 / (1 + exp(-o_part - w_oc * c_t)); const Real tanh_c_t = tanh(c_t); @@ -2962,13 +2978,13 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, const Real dc_t = (c_t_deriv * dtanh_c_t + dc_t_out + do_t_input * w_oc) - tanh_c_t * c_t_self_repair; - const Real dtanh_c_part = i_t * dc_t; - const Real df_t = dc_t * c_prev; + const Real dtanh_c_part = i_t * i_scale * dc_t; + const Real df_t = dc_t * f_scale * c_prev; const Real df_t_input = (df_t * f_t_deriv - - (2 * f_t - 1) * f_t_self_repair); - const Real di_t = dc_t * tanh_c_part; + - (2 * f_t - 1) * f_t_self_repair); + const Real di_t = dc_t * i_scale * tanh_c_part; const Real di_t_input = (di_t * i_t_deriv - - (2 * i_t - 1) * i_t_self_repair); + - (2 * i_t - 1) * i_t_self_repair); if (params_deriv) { w_ic_deriv_sum += c_prev * di_t_input; @@ -2976,7 +2992,7 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int num_rows, w_oc_deriv_sum += c_t * do_t_input; } - const Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t; + const Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t; const Real do_part = do_t_input; const Real dc_part = (c_part_deriv * dtanh_c_part - tanh_c_part * c_part_self_repair); @@ -4591,20 +4607,23 @@ void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, - double* out) { - _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + const int cell_dim, const int have_dropout_mask, + const int num_rows, double* out) { + _lstm_nonlinearity<<>>( + in, in_stride, params, 
params_stride, + out_stride, cell_dim, have_dropout_mask, num_rows, out); } void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, - float* out) { - _lstm_nonlinearity<<>>(in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + const int cell_dim, const int have_dropout_mask, + const int num_rows, float* out) { + _lstm_nonlinearity<<>>( + in, in_stride, params, params_stride, + out_stride, cell_dim, have_dropout_mask, num_rows, out); } void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int input_stride, const double* params, const int params_stride, @@ -4623,7 +4642,8 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, double* self_repair_sum_out, const int self_repair_sum_out_stride) { - _diff_lstm_nonlinearity<<>>(cell_dim, num_rows, input, + _diff_lstm_nonlinearity<<>>( + cell_dim, have_dropout_mask, num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, input_deriv, input_deriv_stride, params_deriv, params_deriv_stride, value_sum_out, @@ -4631,6 +4651,7 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, self_repair_sum_out, self_repair_sum_out_stride); } void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int input_stride, const float* params, const int params_stride, @@ -4649,7 +4670,8 @@ void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, float* self_repair_sum_out, const int self_repair_sum_out_stride) { - _diff_lstm_nonlinearity<<>>(cell_dim, num_rows, input, + _diff_lstm_nonlinearity<<>>( + cell_dim, have_dropout_mask, num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, input_deriv, input_deriv_stride, params_deriv, params_deriv_stride, value_sum_out, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 649a25ab67e..0e578ee7b49 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1258,19 +1258,24 @@ inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, double* out) { cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, float* out) { cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int input_stride, const double* params, @@ -1290,7 +1295,8 @@ inline void 
cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, double* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows, + input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, @@ -1301,6 +1307,7 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, self_repair_sum_out_stride); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int input_stride, const float* params, @@ -1320,7 +1327,8 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, float* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, + num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index abd93fb1a0a..9abb6c7e8d1 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -144,7 +144,8 @@ static void UnitTestCuMathComputeLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 100; int32 cell_dim = 1 + Rand() % 2000; - Matrix Hinput(num_rows, 5 * cell_dim); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + Matrix Hinput(num_rows, 5 * cell_dim + dropout_dim); Matrix Hparams(3, cell_dim); Matrix Houtput(num_rows, 2 * cell_dim); Hinput.SetRandn(); @@ -165,7 +166,8 @@ static void UnitTestCuMathComputeLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; - CuMatrix input(num_rows, 5 * cell_dim); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); CuMatrix output(num_rows, 2 * cell_dim); input.SetRandn(); @@ -190,7 +192,8 @@ void UnitTestLstmNonlinearity() { // problem dimensions. int32 num_rows = RandInt(5, 20), - cell_dim = RandInt(2, 200); + cell_dim = RandInt(2, 200), + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -207,7 +210,7 @@ void UnitTestLstmNonlinearity() { test_params = -1; - CuMatrix input(num_rows, cell_dim * 5), + CuMatrix input(num_rows, cell_dim * 5 + dropout_dim), params(3, cell_dim), output_deriv(num_rows, cell_dim * 2); input.SetRandn(); @@ -230,7 +233,7 @@ void UnitTestLstmNonlinearity() { CuVector self_repair_config(10.0); // leave at zero... we don't really test this here. 
CuMatrix self_repair_sum(5, cell_dim), - input_deriv(num_rows, 5 * cell_dim), + input_deriv(num_rows, 5 * cell_dim + dropout_dim), params_deriv(3, cell_dim); double count_in = 0.0; @@ -249,7 +252,7 @@ void UnitTestLstmNonlinearity() { measured_objf_change(test_dim); for (int32 i = 0; i < test_dim; i++) { - CuMatrix delta_input(num_rows, 5 * cell_dim), + CuMatrix delta_input(num_rows, 5 * cell_dim + dropout_dim), delta_params(3, cell_dim); if (test_input >= 0) { delta_input.ColRange(test_input * cell_dim, cell_dim).SetRandn(); @@ -260,12 +263,9 @@ void UnitTestLstmNonlinearity() { delta_params.Scale(delta); } - - predicted_objf_change(i) = TraceMatMat(delta_input, input_deriv, kTrans) + TraceMatMat(delta_params, params_deriv, kTrans); - CuMatrix perturbed_input(input); perturbed_input.AddMat(1.0, delta_input); @@ -280,7 +280,9 @@ void UnitTestLstmNonlinearity() { measured_objf_change(i) = objf_change; } KALDI_LOG << "LSTM nonlinearity test: num_rows=" << num_rows - << ", cell_dim=" << cell_dim << ", test_input=" << test_input + << ", cell_dim=" << cell_dim + << ", dropout_dim=" << dropout_dim + << ", test_input=" << test_input << ", test_params=" << test_params << ", test_output=" << test_output << ", predicted_objf_change=" << predicted_objf_change @@ -296,16 +298,17 @@ template static void UnitTestBackpropLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 200; - int32 cell_dim = 1 + Rand() % 2000; + int32 cell_dim = 1 + Rand() % 2000, + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); // KALDI_LOG << num_rows << ", " << cell_dim; - Matrix hinput(num_rows, 5 * cell_dim); + Matrix hinput(num_rows, 5 * cell_dim + dropout_dim); Matrix hparams(3, cell_dim); Matrix houtput_deriv(num_rows, 2 * cell_dim); Matrix hderiv_sum_in(5, cell_dim); Vector hself_repair_config(10); double count_in; - Matrix hinput_deriv(num_rows, 5 * cell_dim); + Matrix hinput_deriv(num_rows, 5 * cell_dim + dropout_dim); Matrix hparams_deriv(3, cell_dim); Matrix hvalue_sum_out(5, cell_dim); Matrix hderiv_sum_out(5, cell_dim); @@ -409,15 +412,16 @@ static void UnitTestBackpropLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; + int32 dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 2); - CuMatrix input(num_rows, 5 * cell_dim); + CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); CuMatrix output_deriv(num_rows, 2 * cell_dim); CuMatrix deriv_sum_in(5, cell_dim); CuVector self_repair_config(10); double count_in; - CuMatrix input_deriv(num_rows, 5 * cell_dim); + CuMatrix input_deriv(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params_deriv(3, cell_dim); CuMatrix value_sum_out(5, cell_dim); CuMatrix deriv_sum_out(5, cell_dim); diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index bb55302313a..b76721fcce3 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -317,10 +317,11 @@ template void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, const MatrixBase ¶ms_mat, MatrixBase *output) { - int32 num_rows = input_mat.NumRows(); - int32 cell_dim = input_mat.NumCols() / 5; + int32 num_rows = input_mat.NumRows(), + input_cols = input_mat.NumCols(), + cell_dim = input_cols / 5; + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); KALDI_ASSERT(output->NumRows() == num_rows); - KALDI_ASSERT(input_mat.NumCols() % 5 == 0); KALDI_ASSERT(params_mat.NumRows() == 3); KALDI_ASSERT(params_mat.NumCols() == cell_dim); KALDI_ASSERT(output->NumCols() == 2 * cell_dim); @@ -330,6 +331,10 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, int32 params_stride = params_mat.Stride(); for (int32 r = 0; r < num_rows; r++) { const Real *input_row = input_mat.RowData(r); + // i_scale and f_scale relate to dropout, they will normally be 1.0. + Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]), + f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]); + Real *output_row = output_mat.RowData(r); for (int32 c = 0; c < cell_dim; c++) { Real i_part = input_row[c]; @@ -342,7 +347,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, Real w_oc = params_data[c + params_stride * 2]; Real i_t = ScalarSigmoid(i_part + w_ic * c_prev); Real f_t = ScalarSigmoid(f_part + w_fc * c_prev); - Real c_t = f_t * c_prev + i_t * ScalarTanh(c_part); + Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part); Real o_t = ScalarSigmoid(o_part + w_oc * c_t); Real m_t = o_t * ScalarTanh(c_t); output_row[c] = c_t; @@ -355,10 +360,11 @@ template void ComputeLstmNonlinearity(const CuMatrixBase &input, const CuMatrixBase ¶ms, CuMatrixBase *output) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + input_cols = input.NumCols(), + cell_dim = input_cols / 5; + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); KALDI_ASSERT(output->NumRows() == num_rows); - KALDI_ASSERT(input.NumCols() % 5 == 0); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output->NumCols() == 2 * cell_dim); @@ -367,6 +373,8 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, if (CuDevice::Instantiate().Enabled()) { Timer tim; + int have_dropout_mask = (input_cols == (cell_dim * 5) + 2); + // Each thread block is working on 1 row of the data. 
// It's best that cell dim is a multiple fo CU1DBLOCK dim3 dimBlock(CU1DBLOCK); @@ -374,7 +382,7 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, cuda_lstm_nonlinearity(dimGrid, dimBlock, input.Data(), input.Stride(), params.Data(), params.Stride(), output->Stride(), - cell_dim, num_rows, output->Data()); + cell_dim, have_dropout_mask, num_rows, output->Data()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -414,10 +422,12 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, MatrixBase *value_sum_out, MatrixBase *deriv_sum_out, MatrixBase *self_repair_sum_out) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + input_cols = input + .NumCols(), + cell_dim = input.NumCols() / 5; // Check dimensions. - KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -512,6 +522,12 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, c_part = input_mat(r, c + 2 * cell_dim), o_part = input_mat(r, c + 3 * cell_dim), c_prev = input_mat(r, c + 4 * cell_dim); + + Real i_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5)), + f_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5 + 1)); + // For greater clarity, we give some of the quantities in the // forward equations their own names. Real i_t_input = i_part + w_ic * c_prev, @@ -519,7 +535,7 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, f_t_input = f_part + w_fc * c_prev, f_t = ScalarSigmoid(f_t_input), tanh_c_part = ScalarTanh(c_part), - c_t = f_t * c_prev + i_t * tanh_c_part, + c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part, o_t_input = o_part + w_oc * c_t, o_t = ScalarSigmoid(o_t_input), tanh_c_t = ScalarTanh(c_t); @@ -557,19 +573,19 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, - (2.0F * o_t - 1.0F) * o_t_self_repair); Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out + do_t_input * w_oc) - tanh_c_t * c_t_self_repair; - Real dtanh_c_part = i_t * dc_t; - Real df_t = dc_t * c_prev; - Real df_t_input = (df_t * f_t * (1.0F - f_t) - - (2.0F * f_t - 1.0F) * f_t_self_repair); - Real di_t = dc_t * tanh_c_part; - Real di_t_input = (di_t * i_t * (1.0F - i_t) - - (2.0F * i_t - 1.0F) * i_t_self_repair); + Real dtanh_c_part = i_t * i_scale * dc_t; + Real df_t = dc_t * f_scale * c_prev; + Real df_t_input = ((df_t * f_t * (1.0F - f_t) + - (2.0F * f_t - 1.0F) * f_t_self_repair)); + Real di_t = dc_t * i_scale * tanh_c_part; + Real di_t_input = ((di_t * i_t * (1.0F - i_t) + - (2.0F * i_t - 1.0F) * i_t_self_repair)); w_ic_deriv_sum += c_prev * di_t_input; w_fc_deriv_sum += c_prev * df_t_input; w_oc_deriv_sum += c_t * do_t_input; - Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * dc_t; + Real dc_prev = w_ic * di_t_input + w_fc * df_t_input + f_t * f_scale * dc_t; Real do_part = do_t_input; Real dc_part = ((1.0F - tanh_c_part * tanh_c_part) * dtanh_c_part - tanh_c_part * c_part_self_repair); @@ -630,10 +646,11 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, CuMatrixBase *value_sum_out, CuMatrixBase *deriv_sum_out, CuMatrixBase *self_repair_sum_out) { - int32 num_rows = input.NumRows(); - int32 cell_dim = input.NumCols() / 5; + int32 num_rows = input.NumRows(), + cell_dim = input.NumCols() / 5, + input_cols = 
input.NumCols(); // Check dimensions. - KALDI_ASSERT(input.NumCols() % 5 == 0); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 2); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -668,6 +685,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Each thread block is working on 1 row of the data. // It's best that cell dim is a multiple fo CU1DBLOCK + int have_dropout_mask = (input_cols == (cell_dim * 5) + 2); // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. @@ -681,7 +699,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, dim3 dimGrid(n_blocks(cell_dim, dimBlock.x)); if (input_deriv == NULL) { if (params_deriv == NULL) { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -699,7 +718,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, 0); } else { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -717,7 +737,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, } } else { if (params_deriv == NULL) { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), @@ -727,7 +748,8 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, NULL, 0, NULL, 0, NULL, 0, NULL, 0); } else { - cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, num_rows, + cuda_diff_lstm_nonlinearity(dimGrid, dimBlock, cell_dim, + have_dropout_mask, num_rows, input.Data(), input.Stride(), params.Data(), params.Stride(), output_deriv.Data(), output_deriv.Stride(), deriv_sum_in.Data(), diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 9952ca5b9d2..3313baaa9d1 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -88,6 +88,9 @@ void Group2norm(const CuMatrixBase &src, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). + This function will also accept input of dimension N by 5C+2, + and the two final elements will be used as scaling factors + on i_t and f_t (useful as per-frame dropout masks). @param [in] params A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}. @@ -101,7 +104,6 @@ void Group2norm(const CuMatrixBase &src, o_t = Sigmoid(o_part + w_{oc}*c_t) m_t = o_t * Tanh(c_t) - */ template void ComputeLstmNonlinearity(const CuMatrixBase &input, @@ -134,6 +136,9 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). 
+ This function will also accept input of dimension N by 5C+2, + and the two final elements will be interpreted as scaling factors + on i_t and f_t (useful as per-frame dropout masks). @param [in] params The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -165,9 +170,13 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, May be NULL; if not, this function writes, to this location, the backpropagated derivative of the objective function w.r.t. the 'input' matrix. This matrix should - have the same dimension as 'input' i.e. N by 5C. In - addition to the regular backpropagated derivative, the - output will include small values relating to 'self-repair'. + have the same dimension as 'input'. In addition to the + regular backpropagated derivative, the output will include + small values relating to 'self-repair'. If the input + is of column-dimension 5C + 2 (i.e. we are using dropout + masks), the derivatives w.r.t. the dropout masks will not + be set; they will retain their value prior to this + function call. @param [out] params_deriv May be NULL; if not, this is where this function *writes* [not adds] the backpropagated derivative of the objective @@ -196,23 +205,6 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, processed outside this function into self-repair stats for diagnostics. */ -/// Normalize nonlinearity modifies the vector of activations -/// by scaling it so that the root-mean-square equals 1.0. -/// -/// The output y_i = scale * x_i, -/// and we want to RMS value of the y_i to equal target_rms, -/// so y^t y = D * target_rms^2 (if y is one row of the input). -/// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). -/// there is also flooring involved, to avoid division-by-zero -/// problems. It's important for the backprop, that the floor's -/// square root is exactly representable as float. -/// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) -/// is an extra dimension of the output. -template -void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, - const bool add_log_stddev, CuMatrixBase* out); - - template void BackpropLstmNonlinearity(const CuMatrixBase &input, @@ -241,6 +233,25 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, MatrixBase *deriv_sum_out, MatrixBase *self_repair_sum_out); + +/// Normalize nonlinearity modifies the vector of activations +/// by scaling it so that the root-mean-square equals 1.0. +/// +/// The output y_i = scale * x_i, +/// and we want to RMS value of the y_i to equal target_rms, +/// so y^t y = D * target_rms^2 (if y is one row of the input). +/// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +/// there is also flooring involved, to avoid division-by-zero +/// problems. It's important for the backprop, that the floor's +/// square root is exactly representable as float. +/// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +/// is an extra dimension of the output. 
+template +void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, + const bool add_log_stddev, CuMatrixBase* out); + + + } // namespace cu } // namespace kaldi diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 23a8662a0d5..4a2a8d1c09a 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -147,6 +147,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new ConstantComponent(); } else if (component_type == "DropoutComponent") { ans = new DropoutComponent(); + } else if (component_type == "DropoutMaskComponent") { + ans = new DropoutMaskComponent(); } else if (component_type == "BackpropTruncationComponent") { ans = new BackpropTruncationComponent(); } else if (component_type == "LstmNonlinearityComponent") { diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index c1732fc9b25..7cf438a025e 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -82,8 +82,11 @@ enum ComponentProperties { // Tanh, Sigmoid, ReLU and Softmax). kInputContiguous = 0x1000, // true if the component requires its input data (and // input derivatives) to have Stride()== NumCols(). - kOutputContiguous = 0x2000 // true if the component requires its input data (and + kOutputContiguous = 0x2000, // true if the component requires its input data (and // output derivatives) to have Stride()== NumCols(). + kRandomComponent = 0x4000 // true if the component has some kind of + // randomness, like DropoutComponent (these should + // inherit from class RandomComponent. }; diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 926ebd9b07d..6ff68525d55 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1374,5 +1374,105 @@ void ConstantComponent::UnVectorize(const VectorBase ¶ms) { +std::string DropoutMaskComponent::Info() const { + std::ostringstream stream; + stream << Type() + << ", output-dim=" << output_dim_ + << ", dropout-proportion=" << dropout_proportion_ + << ", exclusive=" << (exclusive_ ? 
"true" : "false"); + return stream.str(); +} + +DropoutMaskComponent::DropoutMaskComponent(): + output_dim_(-1), dropout_proportion_(0.5), + exclusive_(false) { } + +DropoutMaskComponent::DropoutMaskComponent( + const DropoutMaskComponent &other): + output_dim_(other.output_dim_), + dropout_proportion_(other.dropout_proportion_), + exclusive_(other.exclusive_) { } + +void DropoutMaskComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(in.NumRows() == 0 && out->NumCols() == output_dim_); + BaseFloat dropout_proportion = dropout_proportion_; + KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0); + + if (dropout_proportion_ == 0) { + out->Set(1.0); + return; + } + if (!exclusive_) { + const_cast&>(random_generator_).RandUniform(out); + out->Add(-dropout_proportion); + out->ApplyHeaviside(); + } else { + if (!(output_dim_ == 2 && dropout_proportion <= 0.5)) { + KALDI_ERR << "If exclusive=true is set, output-dim must equal 2 (got: " + << output_dim_ << " and dropout-proportion must <= 0.5 (got: " + << dropout_proportion; + } + // To generate data where it's never the case that both of the dimensions + // for a row are zero, we generate uniformly distributed data (call this u_i), + // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) + // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) + int32 num_rows = out->NumRows(); + // later we may make this a bit more efficient. + CuVector temp(num_rows, kUndefined); + const_cast&>(random_generator_).RandUniform(&temp); + temp.Add(-dropout_proportion); + out->CopyColFromVec(temp, 0); + temp.Add(-1.0 + (2.0 * dropout_proportion)); + // Now, 'temp' contains the original uniformly-distributed data plus + // -(1 - dropout_proportion). + temp.Scale(-1.0); + out->CopyColFromVec(temp, 1); + out->ApplyHeaviside(); + } +} + + +void DropoutMaskComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &output_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dropout_proportion_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &exclusive_); + ExpectToken(is, binary, ""); +} + + +void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, output_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dropout_proportion_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, exclusive_); + WriteToken(os, binary, ""); +} + +Component* DropoutMaskComponent::Copy() const { + return new DropoutMaskComponent(*this); +} + +void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { + output_dim_ = 0; + bool ok = cfl->GetValue("output-dim", &output_dim_); + KALDI_ASSERT(ok && output_dim_ > 0); + dropout_proportion_ = 0.5; + cfl->GetValue("dropout-proportion", &dropout_proportion_); + exclusive_ = false; + cfl->GetValue("exclusive", &exclusive_); +} + + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index b945edf4475..d3de9f40548 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -669,6 +669,95 @@ class ConstantComponent: public UpdatableComponent { +// DropoutMaskComponent outputs a random zero-or-one value for all dimensions of +// all requested indexes, and it has no dependencies on any input. 
It's like a +// ConstantComponent, but with random output that has value zero +// a proportion (dropout_proportion) of the time, and otherwise one. +// This is not the normal way to implement dropout; you'd normally use a +// DropoutComponent (see nnet-simple-component.h). This component is used while +// implementing per-frame dropout with the LstmNonlinearityComponent; we +// generate a two-dimensional output representing dropout +// +class DropoutMaskComponent: public RandomComponent { + public: + // actually this component requires no inputs; this value + // is really a don't-care. + virtual int32 InputDim() const { return output_dim_; } + + virtual int32 OutputDim() const { return output_dim_; } + + virtual std::string Info() const; + + // possible parameter values with their defaults: + // dropout-proportion=0.5 output-dim=-1 exclusive=false + // [for the meaning of 'exclusive', see its declaration]. + virtual void InitFromConfig(ConfigLine *cfl); + + DropoutMaskComponent(); + + DropoutMaskComponent(const DropoutMaskComponent &other); + + virtual std::string Type() const { return "DropoutMaskComponent"; } + virtual int32 Properties() const { return kRandomComponent; } + // note: the matrix 'in' will be empty. + virtual void Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + // backprop does nothing, there is nothing to backprop to and nothing + // to update. + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + Component *to_update, + CuMatrixBase *in_deriv) const { } + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + // Some functions that are only to be reimplemented for GeneralComponents. + virtual void GetInputIndexes(const MiscComputationInfo &misc_info, + const Index &output_index, + std::vector *desired_indexes) const { + desired_indexes->clear(); // requires no inputs. + } + + // This function returns true if at least one of the input indexes used to + // compute this output index is computable. + // it's simple because this component requires no inputs. + virtual bool IsComputable(const MiscComputationInfo &misc_info, + const Index &output_index, + const IndexSet &input_index_set, + std::vector *used_inputs) const { + if (used_inputs) used_inputs->clear(); + return true; + } + + void SetDropoutProportion(BaseFloat p) { dropout_proportion_ = p; } + + private: + + // The output dimension + int32 output_dim_; + + BaseFloat dropout_proportion_; + + // If true, and only in the special case where output_dim_ == 2, this + // component will make sure that it's never the case that both columns of a + // row of the output are zero. Note: if this is true, you cannot set + // dropout_proportion_ > 0.5. + bool exclusive_; + + const DropoutMaskComponent &operator + = (const DropoutMaskComponent &other); // Disallow. +}; + + + + } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 527b0d54c01..1cde6b3b0fa 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -4969,13 +4969,20 @@ void CompositeComponent::InitFromConfig(ConfigLine *cfl) { if(this_component->Type() == "CompositeComponent") { DeletePointers(&components); delete this_component; + // This is not allowed. 
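Stepping back to DropoutMaskComponent::Propagate() shown above: the 'exclusive' branch derives both mask columns from a single uniform draw u per row, zeroing column 0 when u <= p and column 1 when u >= 1 - p, so for p <= 0.5 each column is zero with probability p but the two columns are never both zero. A small standalone sketch (hypothetical names, not the Kaldi API):

#include <cstdio>
#include <random>

int main() {
  double p = 0.3;  // dropout proportion; must be <= 0.5 in exclusive mode
  std::mt19937 rng(0);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  for (int row = 0; row < 5; row++) {
    double u = uniform(rng);
    // Heaviside(x) = 1 if x > 0 else 0, as in ApplyHeaviside().
    int col0 = (u - p > 0.0) ? 1 : 0;          // mask on the i gate
    int col1 = ((1.0 - p) - u > 0.0) ? 1 : 0;  // mask on the f gate
    std::printf("u=%.3f -> mask=(%d, %d)\n", u, col0, col1);
  }
  return 0;
}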
If memory is too much with just one + // CompositeComponent, try decreasing max-rows-process instead. KALDI_ERR << "Found CompositeComponent nested within CompositeComponent." - << "Try decreasing max-rows-process instead." << "Nested line: '" << nested_line.WholeLine() << "'\n" << "Toplevel CompositeComponent line '" << cfl->WholeLine() << "'"; } this_component->InitFromConfig(&nested_line); + int32 props = this_component->Properties(); + if ((props & kRandomComponent) != 0 || + (props & kSimpleComponent) == 0) { + KALDI_ERR << "CompositeComponent contains disallowed component type: " + << nested_line.WholeLine(); + } components.push_back(this_component); } if (cfl->HasUnusedValues()) @@ -4995,10 +5002,9 @@ void CompositeComponent::SetComponent(int32 i, Component *component) { components_[i] = component; } - int32 LstmNonlinearityComponent::InputDim() const { int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 5; + return cell_dim * 5 + (use_dropout_ ? 2 : 0); } int32 LstmNonlinearityComponent::OutputDim() const { @@ -5020,7 +5026,15 @@ void LstmNonlinearityComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); self_repair_total_.Read(is, binary); - ExpectToken(is, binary, ""); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + ReadBasicType(is, binary, &use_dropout_); + ReadToken(is, binary, &tok); + } else { + use_dropout_ = false; + } + KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &count_); // For the on-disk format, we normalze value_sum_, deriv_sum_ and @@ -5067,6 +5081,8 @@ void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { self_repair_prob.Scale(1.0 / (count_ * cell_dim)); self_repair_prob.Write(os, binary); } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_dropout_); WriteToken(os, binary, ""); WriteBasicType(os, binary, count_); WriteToken(os, binary, ""); @@ -5077,7 +5093,8 @@ void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { std::string LstmNonlinearityComponent::Info() const { std::ostringstream stream; int32 cell_dim = params_.NumCols(); - stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim; + stream << UpdatableComponent::Info() << ", cell-dim=" << cell_dim + << ", use-dropout=" << (use_dropout_ ? 
"true" : "false"); PrintParameterStats(stream, "w_ic", params_.Row(0)); PrintParameterStats(stream, "w_fc", params_.Row(1)); PrintParameterStats(stream, "w_oc", params_.Row(2)); @@ -5243,6 +5260,7 @@ LstmNonlinearityComponent::LstmNonlinearityComponent( const LstmNonlinearityComponent &other): UpdatableComponent(other), params_(other.params_), + use_dropout_(other.use_dropout_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_), self_repair_config_(other.self_repair_config_), @@ -5251,7 +5269,8 @@ LstmNonlinearityComponent::LstmNonlinearityComponent( preconditioner_(other.preconditioner_) { } void LstmNonlinearityComponent::Init( - int32 cell_dim, BaseFloat param_stddev, + int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, BaseFloat tanh_self_repair_threshold, BaseFloat sigmoid_self_repair_threshold, BaseFloat self_repair_scale) { @@ -5261,6 +5280,7 @@ void LstmNonlinearityComponent::Init( sigmoid_self_repair_threshold >= 0.0 && sigmoid_self_repair_threshold <= 0.25 && self_repair_scale >= 0.0 && self_repair_scale <= 0.1); + use_dropout_ = use_dropout; params_.Resize(3, cell_dim); params_.SetRandn(); params_.Scale(param_stddev); @@ -5295,6 +5315,7 @@ void LstmNonlinearityComponent::InitNaturalGradient() { void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { InitLearningRatesFromConfig(cfl); bool ok = true; + bool use_dropout = false; int32 cell_dim; // these self-repair thresholds are the normal defaults for tanh and sigmoid // respectively. If, later on, we decide that we want to support different @@ -5314,6 +5335,7 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("sigmoid-self-repair-threshold", &sigmoid_self_repair_threshold); cfl->GetValue("self-repair-scale", &self_repair_scale); + cfl->GetValue("use-dropout", &use_dropout); // We may later on want to make it possible to initialize the different // parameters w_ic, w_fc and w_oc with different biases. We'll implement @@ -5323,7 +5345,7 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (ok) { - Init(cell_dim, param_stddev, tanh_self_repair_threshold, + Init(cell_dim, use_dropout, param_stddev, tanh_self_repair_threshold, sigmoid_self_repair_threshold, self_repair_scale); } else { KALDI_ERR << "Invalid initializer for layer of type " diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 62b4c9006d8..ea5df928b37 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -99,7 +99,8 @@ class DropoutComponent : public RandomComponent { dropout_per_frame_(false) { } virtual int32 Properties() const { - return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput|kBackpropNeedsOutput; + return kLinearInInput|kBackpropInPlace|kSimpleComponent|kBackpropNeedsInput| + kBackpropNeedsOutput|kRandomComponent; } virtual std::string Type() const { return "DropoutComponent"; } @@ -1677,8 +1678,9 @@ class ConvolutionComponent: public UpdatableComponent { // o_part = W_{cx} x_t + W_{om} m_{t-1} + b_o // // The part of the computation that takes place in this component is as follows. -// Its input is of dimension 5C, consisting of 5 blocks: (i_part, f_part, c_part, o_part, and -// c_{t-1}). Its output is of dimension 2C, consisting of 2 blocks: c_t and m_t. +// Its input is of dimension 5C [however, search for 'dropout' below], +// consisting of 5 blocks: (i_part, f_part, c_part, o_part, and c_{t-1}). 
Its +// output is of dimension 2C, consisting of 2 blocks: c_t and m_t. // // To recap: the input is (i_part, f_part, c_part, o_part, c_{t-1}); the output is (c_t, m_t). // @@ -1696,6 +1698,12 @@ class ConvolutionComponent: public UpdatableComponent { // m_t = o_t * Tanh(c_t) (5) // # note: the outputs are just c_t and m_t. // +// [Note regarding dropout: optionally the input-dimension may be 5C + 2 instead +// of 5C in this case, the last two input dimensions will be interpreted as +// per-frame dropout masks on i_t and f_t respectively, so that in (3), i_t is +// replaced by i_t * i_t_scale, and likewise for f_t. +// +// // The backprop is as you would think, but for the "self-repair" we need to pass // in additional vectors (of the same dim as the parameters of the layer) that // dictate whether or not we add an additional term to the backpropagated @@ -1715,7 +1723,7 @@ class LstmNonlinearityComponent: public UpdatableComponent { virtual int32 OutputDim() const; virtual std::string Info() const; virtual void InitFromConfig(ConfigLine *cfl); - LstmNonlinearityComponent() { } // use Init to really initialize. + LstmNonlinearityComponent(): use_dropout_(false) { } virtual std::string Type() const { return "LstmNonlinearityComponent"; } virtual int32 Properties() const { return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput; @@ -1751,15 +1759,12 @@ class LstmNonlinearityComponent: public UpdatableComponent { explicit LstmNonlinearityComponent( const LstmNonlinearityComponent &other); - void Init(int32 cell_dim, BaseFloat param_stddev, + void Init(int32 cell_dim, bool use_dropout, + BaseFloat param_stddev, BaseFloat tanh_self_repair_threshold, BaseFloat sigmoid_self_repair_threshold, BaseFloat self_repair_scale); - void Init(std::string vector_filename, - int32 rank, int32 update_period, BaseFloat num_samples_history, - BaseFloat alpha, BaseFloat max_change_per_minibatch); - private: // Initializes the natural-gradient object with the configuration we @@ -1773,6 +1778,10 @@ class LstmNonlinearityComponent: public UpdatableComponent { // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. CuMatrix params_; + // If true, we expect an extra 2 dimensions on the input, for dropout masks + // for i_t and f_t. + bool use_dropout_; + // Of dimension 5 * C, with a row for each of the Sigmoid/Tanh functions in // equations (1) through (5), this is the sum of the values of the nonliearities // (used for diagnostics only). 
It is comparable to value_sum_ vector diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index a7f732a9864..27415fe8775 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -21,6 +21,7 @@ #include "nnet3/nnet-utils.h" #include "nnet3/nnet-graph.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-general-component.h" #include "nnet3/nnet-parse.h" namespace kaldi { @@ -461,6 +462,10 @@ void SetDropoutProportion(BaseFloat dropout_proportion, DropoutComponent *dc = dynamic_cast(comp); if (dc != NULL) dc->SetDropoutProportion(dropout_proportion); + DropoutMaskComponent *mc = + dynamic_cast(nnet->GetComponent(c)); + if (mc != NULL) + mc->SetDropoutProportion(dropout_proportion); } } @@ -629,16 +634,20 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { KALDI_ERR << "In edits-config, expected proportion to be set in line: " << config_line.WholeLine(); } - DropoutComponent *dropout_component = NULL; int32 num_dropout_proportions_set = 0; for (int32 c = 0; c < nnet->NumComponents(); c++) { if (NameMatchesPattern(nnet->GetComponentName(c).c_str(), - name_pattern.c_str()) && - (dropout_component = - dynamic_cast(nnet->GetComponent(c)))) { + name_pattern.c_str())) { + DropoutComponent *dropout_component = + dynamic_cast(nnet->GetComponent(c)); + DropoutMaskComponent *mask_component = + dynamic_cast(nnet->GetComponent(c)); if (dropout_component != NULL) { dropout_component->SetDropoutProportion(proportion); num_dropout_proportions_set++; + } else if (mask_component != NULL){ + mask_component->SetDropoutProportion(proportion); + num_dropout_proportions_set++; } } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 3bda01271d2..0ed5aa0d5c5 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -160,7 +160,7 @@ void ConvertRepeatedToBlockAffine(Nnet *nnet); /// Info() function (we need this in the CTC code). std::string NnetInfo(const Nnet &nnet); -/// This function sets the dropout proportion in all dropout component to +/// This function sets the dropout proportion in all dropout components to /// dropout_proportion value. 
void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); From 863534b0ec80ef33088183a2509965782e44e5e6 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 30 Jan 2017 23:11:16 -0500 Subject: [PATCH 02/21] [egs] Small fixes/additions in Swbd/s5c chain scripts --- .../local/chain/tuning/run_tdnn_lstm_1e.sh | 4 +- .../local/chain/tuning/run_tdnn_lstm_1g.sh | 261 ++++++++++++++++++ 2 files changed, 263 insertions(+), 2 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh index bf93b156974..14dbb1cdd2e 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -242,11 +242,11 @@ if [ $stage -le 16 ]; then --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires \ - $dir/decode_${decode_set}_sw1_tg || exit 1; + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; if $has_fisher; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ - $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; fi ) & done diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh new file mode 100755 index 00000000000..6cacdf2dadb --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +# 1g is like 1e, but reducing decay-time from 20 to 15, to see if +# it reduces the difference between regular and looped decoding. +# +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1g # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir=exp/chain/tdnn_lstm_1d_sp/egs + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=15" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; From eb0f45819a33c79d56b99302d94d276c5921258d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 31 Jan 2017 14:13:59 -0500 Subject: [PATCH 03/21] [src,egs,scripts] Modifying dropout in LSTM to be on (i,f,o) gates not just (i,f); test on tedlium. --- .../local/chain/tuning/run_tdnn_lstm_1p.sh | 21 +++++++++++++++- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 6 ++--- src/cudamatrix/cu-kernels.cu | 23 ++++++++++------- src/cudamatrix/cu-math-test.cc | 10 ++++---- src/cudamatrix/cu-math.cc | 25 +++++++++++-------- src/cudamatrix/cu-math.h | 14 +++++------ src/nnet3/nnet-simple-component.cc | 10 +++++--- src/nnet3/nnet-simple-component.h | 6 ++--- 8 files changed, 73 insertions(+), 42 deletions(-) diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh index 246601d8535..f06f4a7f6ec 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh @@ -4,6 +4,25 @@ # did it in the non-fast LSTMs, with separate per-frame masks on # the i and f component. Using dropout schedule that maxes out at # 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. +# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + # # # local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi @@ -95,7 +114,7 @@ frames_per_chunk_primary=140 # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1p #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1p2 #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. # End configuration section. 
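The script and kernel changes below extend the per-frame dropout mask from two elements (i, f) to three (i, f, o). A minimal standalone sketch (illustrative names, not Kaldi code) of the resulting forward computation for one cell: c_t keeps the same form as before, and only the output m_t picks up the new o_scale factor.

#include <cmath>
#include <cstdio>

static double Sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// One cell of the nonlinearity; the scales come from a 3-element per-frame mask.
void LstmCell(double i_part, double f_part, double c_part, double o_part,
              double c_prev, double w_ic, double w_fc, double w_oc,
              double i_scale, double f_scale, double o_scale,
              double *c_t, double *m_t) {
  double i_t = Sigmoid(i_part + w_ic * c_prev);
  double f_t = Sigmoid(f_part + w_fc * c_prev);
  *c_t = f_t * f_scale * c_prev + i_t * i_scale * std::tanh(c_part);
  double o_t = Sigmoid(o_part + w_oc * *c_t);
  *m_t = o_t * o_scale * std::tanh(*c_t);  // o_scale is the new factor
}

int main() {
  double c_t, m_t;
  // o gate dropped out for this frame: o_scale = 0 zeroes the output m_t
  // but leaves the cell state c_t untouched.
  LstmCell(0.1, 0.2, 0.3, 0.4, 0.5, 0.1, 0.1, 0.1, 1.0, 1.0, 0.0, &c_t, &m_t);
  std::printf("c_t=%g m_t=%g\n", c_t, m_t);
  return 0;
}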
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index f6d93808538..ac2deb7ecd6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -809,9 +809,9 @@ def generate_lstm_config(self): configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) if dropout_proportion != -1.0: - configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=2 " - "dropout-proportion={1} exclusive={2}" - .format(name, dropout_proportion, dropout_exclusive)) + configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=3 " + "dropout-proportion={1} " + .format(name, dropout_proportion)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index f50e5853fdd..d9d463d1aca 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -2723,8 +2723,8 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). If 'have_dropout_mask' is nonzero, each row of - 'in' will have two extra elements, interpreted - as dropout masks/scales for i_t and f_t. + 'in' will have 3 extra elements, interpreted + as dropout masks/scales for i_t, f_t and o_t. @param [in] params A matrix, of dimension 3 by cell_dim, with rows containing the 3 diagonal parameter matrices used in LSTMs, namely @@ -2764,7 +2764,8 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, Real* c_t = out + i * out_stride; Real* m_t = out + i * out_stride + cell_dim; Real i_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5] : 1), - f_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 1] : 1); + f_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 1] : 1), + o_scale = (have_dropout_mask ? in[i * in_stride + cell_dim * 5 + 2] : 1); for (int j = tid; j < cell_dim; j += CU1DBLOCK) { Real c_tm1_j = c_tm1[j]; @@ -2773,7 +2774,7 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, Real c_t_j = f_t_j * f_scale * c_tm1_j + i_t_j * i_scale * tanh(c_part[j]); Real o_t_j = Real(1) / (Real(1) + exp(-o_part[j] - w_oc[j] * c_t_j)); c_t[j] = c_t_j; - m_t[j] = o_t_j * tanh(c_t_j); + m_t[j] = o_t_j * o_scale * tanh(c_t_j); } } @@ -2799,8 +2800,8 @@ static void _lstm_nonlinearity(const Real* in, const int in_stride, consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). If 'have_dropout_mask' is nonzero, each row of - 'in' will have two extra elements, interpreted - as dropout masks/scales for i_t and f_t. + 'in' will have 3 extra elements, interpreted + as dropout masks/scales for i_t, f_t and o_t. @param [in] params The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -2940,7 +2941,11 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m const Real i_scale = (have_dropout_mask ? input[i * input_stride + cell_dim * 5] : 1), f_scale = (have_dropout_mask ? 
- input[i * input_stride + cell_dim * 5 + 1] :1); + input[i * input_stride + cell_dim * 5 + 1] :1), + o_scale = (have_dropout_mask ? + input[i * input_stride + cell_dim * 5 + 2] :1); + + const Real i_t = Real(1) / (1 + exp(-i_part - w_ic * c_prev)); const Real f_t = Real(1) / (1 + exp(-f_part - w_fc * c_prev)); const Real tanh_c_part = tanh(c_part); @@ -2971,8 +2976,8 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m const Real dc_t_out = output_deriv[i * output_deriv_stride + j]; const Real dm_t = output_deriv[i * output_deriv_stride + j + cell_dim]; - const Real dtanh_c_t = o_t * dm_t; - const Real do_t = tanh_c_t * dm_t; + const Real dtanh_c_t = o_t * o_scale * dm_t; + const Real do_t = o_scale * tanh_c_t * dm_t; const Real do_t_input = (o_t_deriv * do_t - (2 * o_t - 1) * o_t_self_repair); diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 9abb6c7e8d1..9854692f356 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -144,7 +144,7 @@ static void UnitTestCuMathComputeLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 100; int32 cell_dim = 1 + Rand() % 2000; - int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); Matrix Hinput(num_rows, 5 * cell_dim + dropout_dim); Matrix Hparams(3, cell_dim); Matrix Houtput(num_rows, 2 * cell_dim); @@ -166,7 +166,7 @@ static void UnitTestCuMathComputeLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; - int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); CuMatrix output(num_rows, 2 * cell_dim); @@ -193,7 +193,7 @@ void UnitTestLstmNonlinearity() { // problem dimensions. int32 num_rows = RandInt(5, 20), cell_dim = RandInt(2, 200), - dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // Pick the (input or params block), and output block, for which we'll // spot-check the derivative values. This will give us test failures @@ -299,7 +299,7 @@ static void UnitTestBackpropLstmNonlinearity() { for (int i = 0; i < 3; i++) { int32 num_rows = 1 + Rand() % 200; int32 cell_dim = 1 + Rand() % 2000, - dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + dropout_dim = (RandInt(0, 1) == 0 ? 0 : 3); // KALDI_LOG << num_rows << ", " << cell_dim; Matrix hinput(num_rows, 5 * cell_dim + dropout_dim); @@ -412,7 +412,7 @@ static void UnitTestBackpropLstmNonlinearity() { BaseFloat time_in_secs = 0.025; int32 num_rows = i; int32 cell_dim = i; - int32 dropout_dim = (RandInt(0, 1) == 0 ? 0 : 2); + int32 dropout_dim = (RandInt(0, 1) == 0 ? 
0 : 3); CuMatrix input(num_rows, 5 * cell_dim + dropout_dim); CuMatrix params(3, cell_dim); diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index b76721fcce3..13b2f450bbb 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -320,7 +320,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, int32 num_rows = input_mat.NumRows(), input_cols = input_mat.NumCols(), cell_dim = input_cols / 5; - KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(output->NumRows() == num_rows); KALDI_ASSERT(params_mat.NumRows() == 3); KALDI_ASSERT(params_mat.NumCols() == cell_dim); @@ -333,7 +333,8 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, const Real *input_row = input_mat.RowData(r); // i_scale and f_scale relate to dropout, they will normally be 1.0. Real i_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5]), - f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]); + f_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 1]), + o_scale = (input_cols == cell_dim*5 ? 1.0:input_row[cell_dim*5 + 2]); Real *output_row = output_mat.RowData(r); for (int32 c = 0; c < cell_dim; c++) { @@ -349,7 +350,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input_mat, Real f_t = ScalarSigmoid(f_part + w_fc * c_prev); Real c_t = f_t * f_scale * c_prev + i_t * i_scale * ScalarTanh(c_part); Real o_t = ScalarSigmoid(o_part + w_oc * c_t); - Real m_t = o_t * ScalarTanh(c_t); + Real m_t = o_t * o_scale * ScalarTanh(c_t); output_row[c] = c_t; output_row[c + cell_dim] = m_t; } @@ -363,7 +364,7 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, int32 num_rows = input.NumRows(), input_cols = input.NumCols(), cell_dim = input_cols / 5; - KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(output->NumRows() == num_rows); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); @@ -373,7 +374,7 @@ void ComputeLstmNonlinearity(const CuMatrixBase &input, if (CuDevice::Instantiate().Enabled()) { Timer tim; - int have_dropout_mask = (input_cols == (cell_dim * 5) + 2); + int have_dropout_mask = (input_cols == (cell_dim * 5) + 3); // Each thread block is working on 1 row of the data. // It's best that cell dim is a multiple fo CU1DBLOCK @@ -427,7 +428,7 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, .NumCols(), cell_dim = input.NumCols() / 5; // Check dimensions. - KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 2); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim * 5) + 3); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -526,7 +527,9 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, Real i_scale = (input_cols == cell_dim * 5 ? 1.0 : input_mat(r, cell_dim * 5)), f_scale = (input_cols == cell_dim * 5 ? 1.0 : - input_mat(r, cell_dim * 5 + 1)); + input_mat(r, cell_dim * 5 + 1)), + o_scale = (input_cols == cell_dim * 5 ? 1.0 : + input_mat(r, cell_dim * 5 + 2)); // For greater clarity, we give some of the quantities in the // forward equations their own names. @@ -567,8 +570,8 @@ void CpuBackpropLstmNonlinearity(const MatrixBase &input, // comes directly from the output of this function. 
Real dc_t_out = output_deriv_mat(r, c); Real dm_t = output_deriv_mat(r, c + cell_dim); - Real dtanh_c_t = o_t * dm_t; - Real do_t = tanh_c_t * dm_t; + Real dtanh_c_t = o_t * o_scale * dm_t; + Real do_t = o_scale * tanh_c_t * dm_t; Real do_t_input = (o_t * (1.0F - o_t) * do_t - (2.0F * o_t - 1.0F) * o_t_self_repair); Real dc_t = ((1.0F - tanh_c_t * tanh_c_t) * dtanh_c_t + dc_t_out @@ -650,7 +653,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, cell_dim = input.NumCols() / 5, input_cols = input.NumCols(); // Check dimensions. - KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 2); + KALDI_ASSERT(input_cols == (cell_dim * 5) || input_cols == (cell_dim*5) + 3); KALDI_ASSERT(params.NumRows() == 3); KALDI_ASSERT(params.NumCols() == cell_dim); KALDI_ASSERT(output_deriv.NumRows() == num_rows); @@ -685,7 +688,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Each thread block is working on 1 row of the data. // It's best that cell dim is a multiple fo CU1DBLOCK - int have_dropout_mask = (input_cols == (cell_dim * 5) + 2); + int have_dropout_mask = (input_cols == (cell_dim * 5) + 3); // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 3313baaa9d1..3cc61da1744 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -88,9 +88,9 @@ void Group2norm(const CuMatrixBase &src, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). - This function will also accept input of dimension N by 5C+2, - and the two final elements will be used as scaling factors - on i_t and f_t (useful as per-frame dropout masks). + This function will also accept input of dimension N by 5C + 3, + and the three final elements will be used as scaling factors + on i_t, f_t and o_t (useful as per-frame dropout masks). @param [in] params A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely w_{ic}, w_{fc} and w_{oc}. @@ -136,9 +136,9 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, a multiple of 5). The column-space is interpreted as 5 consecutive blocks, each of dimension C, which we name: (i_part, f_part, c_part, o_part, c_{t-1}). - This function will also accept input of dimension N by 5C+2, - and the two final elements will be interpreted as scaling factors - on i_t and f_t (useful as per-frame dropout masks). + This function will also accept input of dimension N by 5C + 3, + and the three final elements will be interpreted as scaling factors + on i_t, f_t and o_t (useful as per-frame dropout masks). @param [in] params The same as in ComputeLstmNonlinearity(). A matrix, of dimension 3 by C, with rows containing the three diagonal parameter matrices used in LSTMs, namely @@ -173,7 +173,7 @@ void CpuComputeLstmNonlinearity(const MatrixBase &input, have the same dimension as 'input'. In addition to the regular backpropagated derivative, the output will include small values relating to 'self-repair'. If the input - is of column-dimension 5C + 2 (i.e. we are using dropout + is of column-dimension 5C + 3 (i.e. we are using dropout masks), the derivatives w.r.t. the dropout masks will not be set; they will retain their value prior to this function call. 
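For reference, here is a scalar sketch (illustrative names, self-repair terms omitted, not the Kaldi implementation) of how the three dropout scales enter the backward pass described above; as documented, no derivative is produced for the scale columns themselves.

#include <cmath>
#include <cstdio>

static double Sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// Backprop for one cell. Inputs: the five forward inputs, the diagonal
// weights, the three dropout scales, and the derivatives w.r.t. the two
// outputs (dc_t_out for c_t, dm_t for m_t). Output: derivatives w.r.t.
// (i_part, f_part, c_part, o_part, c_prev) in d[0..4].
void LstmCellBackprop(double i_part, double f_part, double c_part,
                      double o_part, double c_prev,
                      double w_ic, double w_fc, double w_oc,
                      double i_scale, double f_scale, double o_scale,
                      double dc_t_out, double dm_t, double d[5]) {
  // Recompute the forward quantities.
  double i_t = Sigmoid(i_part + w_ic * c_prev);
  double f_t = Sigmoid(f_part + w_fc * c_prev);
  double tanh_c_part = std::tanh(c_part);
  double c_t = f_t * f_scale * c_prev + i_t * i_scale * tanh_c_part;
  double o_t = Sigmoid(o_part + w_oc * c_t);
  double tanh_c_t = std::tanh(c_t);

  // Backward pass; the scales appear exactly where they did in the forward.
  double dtanh_c_t = o_t * o_scale * dm_t;
  double do_t = o_scale * tanh_c_t * dm_t;
  double do_t_input = o_t * (1.0 - o_t) * do_t;
  double dc_t = (1.0 - tanh_c_t * tanh_c_t) * dtanh_c_t
      + dc_t_out + do_t_input * w_oc;
  double dtanh_c_part = i_t * i_scale * dc_t;
  double df_t_input = dc_t * f_scale * c_prev * f_t * (1.0 - f_t);
  double di_t_input = dc_t * i_scale * tanh_c_part * i_t * (1.0 - i_t);

  d[0] = di_t_input;                                        // d(i_part)
  d[1] = df_t_input;                                        // d(f_part)
  d[2] = (1.0 - tanh_c_part * tanh_c_part) * dtanh_c_part;  // d(c_part)
  d[3] = do_t_input;                                        // d(o_part)
  d[4] = w_ic * di_t_input + w_fc * df_t_input
      + f_t * f_scale * dc_t;                               // d(c_prev)
}

int main() {
  double d[5];
  LstmCellBackprop(0.1, 0.2, 0.3, 0.4, 0.5, 0.1, 0.1, 0.1,
                   1.0, 1.0, 0.0, 0.2, 1.0, d);
  std::printf("di=%g df=%g dc=%g do=%g dc_prev=%g\n",
              d[0], d[1], d[2], d[3], d[4]);
  return 0;
}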
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 1cde6b3b0fa..dbb3729ec0d 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -5004,7 +5004,7 @@ void CompositeComponent::SetComponent(int32 i, Component *component) { int32 LstmNonlinearityComponent::InputDim() const { int32 cell_dim = value_sum_.NumCols(); - return cell_dim * 5 + (use_dropout_ ? 2 : 0); + return cell_dim * 5 + (use_dropout_ ? 3 : 0); } int32 LstmNonlinearityComponent::OutputDim() const { @@ -5081,8 +5081,12 @@ void LstmNonlinearityComponent::Write(std::ostream &os, bool binary) const { self_repair_prob.Scale(1.0 / (count_ * cell_dim)); self_repair_prob.Write(os, binary); } - WriteToken(os, binary, ""); - WriteBasicType(os, binary, use_dropout_); + if (use_dropout_) { + // only write this if true; we have back-compat code in reading anyway. + // this makes the models without dropout easier to read with older code. + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_dropout_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, count_); WriteToken(os, binary, ""); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index ea5df928b37..60fd1634598 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1698,10 +1698,10 @@ class ConvolutionComponent: public UpdatableComponent { // m_t = o_t * Tanh(c_t) (5) // # note: the outputs are just c_t and m_t. // -// [Note regarding dropout: optionally the input-dimension may be 5C + 2 instead +// [Note regarding dropout: optionally the input-dimension may be 5C + 3 instead // of 5C in this case, the last two input dimensions will be interpreted as -// per-frame dropout masks on i_t and f_t respectively, so that in (3), i_t is -// replaced by i_t * i_t_scale, and likewise for f_t. +// per-frame dropout masks on i_t, f_t and o_t respectively, so that in (3), i_t is +// replaced by i_t * i_t_scale, and likewise for f_t and o_t. // // // The backprop is as you would think, but for the "self-repair" we need to pass From 96d92d77a6ba8e21b2fc5e012f3bdd019ffbd1b0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 31 Jan 2017 14:40:11 -0500 Subject: [PATCH 04/21] Merge remote-tracking branch 'upstream/shortcut' into shortcut-dropout --- egs/wsj/s5/steps/shift_feats.sh | 5 ++ .../s5/utils/data/shift_and_combine_feats.sh | 55 ++++++++++++ egs/wsj/s5/utils/data/shift_feats.sh | 55 ++++++++++++ src/featbin/shift-feats.cc | 89 +++++++++++++------ 4 files changed, 176 insertions(+), 28 deletions(-) create mode 100755 egs/wsj/s5/utils/data/shift_and_combine_feats.sh create mode 100755 egs/wsj/s5/utils/data/shift_feats.sh diff --git a/egs/wsj/s5/steps/shift_feats.sh b/egs/wsj/s5/steps/shift_feats.sh index 22b17f2cb09..ada5716f187 100755 --- a/egs/wsj/s5/steps/shift_feats.sh +++ b/egs/wsj/s5/steps/shift_feats.sh @@ -3,6 +3,9 @@ # Copyright 2016 Vimal Manohar # Apache 2.0 +# This script is deprecated. The newer script utils/data/shift_feats.sh +# should be used instead. + # This script shifts the feats in the input data directory and creates a # new directory _fs with shifted feats. # If the shift is negative, the initial frames get truncated and the @@ -25,6 +28,8 @@ if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -ne 4 ]; then + echo "This script is deprecated. The newer script utils/data/shift_feats.sh" + echo "should be used instead." 
echo "usage: $0 [options] "; echo "e.g.: $0 -1 data/train exp/shift-1_train mfcc" echo "options: " diff --git a/egs/wsj/s5/utils/data/shift_and_combine_feats.sh b/egs/wsj/s5/utils/data/shift_and_combine_feats.sh new file mode 100755 index 00000000000..1a15b324ee8 --- /dev/null +++ b/egs/wsj/s5/utils/data/shift_and_combine_feats.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 3 data/train data/train_fs3" + echo "For use in perturbing data for discriminative training and alignment of" + echo "frame-subsampled systems, this script uses utils/data/shift_feats.sh" + echo "and utils/data/combine_data.sh to shift the features" + echo " different ways and combine them." + echo "E.g. if is 3, this script will combine" + echo "the data frame-shifted by -1, 0 and 1 (c.f. shift-feats)." + exit 1 +fi + +frame_subsampling_factor=$1 +srcdir=$2 +destdir=$3 + +if [ ! -f $srcdir/feats.scp ]; then + echo "$0: expected $srcdir/feats.scp to exist" + exit 1 +fi + +if [ -f $destdir/feats.scp ]; then + echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)" + exit 1 +fi + +tmp_shift_destdirs=() +for frame_shift in `seq $[-(frame_subsampling_factor/2)] $[-(frame_subsampling_factor/2) + frame_subsampling_factor - 1]`; do + if [ "$frame_shift" == 0 ]; then continue; fi + utils/data/shift_feats.sh $frame_shift $srcdir ${destdir}_fs$frame_shift || exit 1 + tmp_shift_destdirs+=("${destdir}_fs$frame_shift") +done +utils/data/combine_data.sh $destdir $srcdir ${tmp_shift_destdirs[@]} || exit 1 +rm -r ${tmp_shift_destdirs[@]} + +utils/validate_data_dir.sh $destdir + +src_nf=`cat $srcdir/feats.scp | wc -l` +dest_nf=`cat $destdir/feats.scp | wc -l` +if [ $[src_nf*frame_subsampling_factor] -ne $dest_nf ]; then + echo "There was a problem. Expected number of feature lines in destination dir to be $[src_nf*frame_subsampling_factor];" + exit 1; +fi + +echo "$0: Successfully generated $frame_subsampling_factor-way shifted version of data in $srcdir, in $destdir" diff --git a/egs/wsj/s5/utils/data/shift_feats.sh b/egs/wsj/s5/utils/data/shift_feats.sh new file mode 100755 index 00000000000..2ae7b2435d3 --- /dev/null +++ b/egs/wsj/s5/utils/data/shift_feats.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2017 Hossein Hadian +# Apache 2.0 + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo " Usage: $0 " + echo "e.g.: $0 -1 data/train data/train_fs-1" + echo "The script creates a new data directory with the features modified" + echo "using the program shift-feats with the specified frame-shift." + echo "This program automatically adds the prefix 'fs-' to the" + echo "utterance and speaker names. See also utils/data/shift_and_combine_feats.sh" + exit 1 +fi + +frame_shift=$1 +srcdir=$2 +destdir=$3 + + +if [ "$destdir" == "$srcdir" ]; then + echo "$0: this script requires and to be different." + exit 1 +fi + +if [ ! 
-f $srcdir/feats.scp ]; then + echo "$0: no such file $srcdir/feats.scp" + exit 1; +fi + +utt_prefix="fs$frame_shift-" +spk_prefix="fs$frame_shift-" + +mkdir -p $destdir +utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \ + $srcdir $destdir + +if grep --quiet "'" $srcdir/feats.scp; then + echo "$0: the input features already use single quotes. Can't proceed." + exit 1; +fi + +awk -v shift=$frame_shift 'NF == 2 {uttid=$1; feat=$2; qt="";} \ +NF > 2 {idx=index($0, " "); uttid=$1; feat=substr($0, idx + 1); qt="\x27";} \ +NF {print uttid " shift-feats --print-args=false --shift=" shift, qt feat qt " - |";}' \ + $destdir/feats.scp >$destdir/feats_shifted.scp +mv -f $destdir/feats_shifted.scp $destdir/feats.scp + +echo "$0: Done" + diff --git a/src/featbin/shift-feats.cc b/src/featbin/shift-feats.cc index 7b970e92248..5d392c9d15a 100644 --- a/src/featbin/shift-feats.cc +++ b/src/featbin/shift-feats.cc @@ -22,20 +22,41 @@ #include "util/common-utils.h" #include "matrix/kaldi-matrix.h" +namespace kaldi { + void ShiftFeatureMatrix(const Matrix &src, int32 shift, + Matrix* rearranged) { + for (int32 r = 0; r < src.NumRows(); r++) { + int32 src_r = r - shift; + if (src_r < 0) src_r = 0; + if (src_r >= src.NumRows()) src_r = src.NumRows() - 1; + rearranged->Row(r).CopyFromVec(src.Row(src_r)); + } + } +} int main(int argc, char *argv[]) { try { using namespace kaldi; const char *usage = - "Copy features and possibly shift them in time while maintaining the length, e.g.\n" - "shift-feats --shift=1 will shift all frames to the\n" - "right by one (the first frame would be duplicated).\n" - "See also: copy-feats, copy-matrix\n"; + "Copy features, and possibly shift them while maintaining the " + "num-frames.\n" + "Usage: shift-feats [options] " + "\n" + "or: shift-feats [options] \n" + "e.g.: shift-feats --shift=-1 foo.scp bar.ark\n" + "or: shift-feats --shift=1 foo.mat bar.mat\n" + "See also: copy-feats, copy-matrix, select-feats, extract-rows,\n" + "subset-feats, subsample-feats, splice-feats, paste-feats, " + "concat-feats\n"; ParseOptions po(usage); + bool binary = true; int32 shift = 0; - po.Register("shift", &shift, "Number of frames by which to shift the features."); + po.Register("shift", &shift, "Number of frames by which to shift the " + "features."); + po.Register("binary", &binary, "Binary-mode output (not relevant if " + "writing to archive)"); po.Read(argc, argv); @@ -46,32 +67,40 @@ int main(int argc, char *argv[]) { int32 num_done = 0, num_err = 0; - SequentialBaseFloatMatrixReader feat_reader(po.GetArg(1)); - BaseFloatMatrixWriter feat_writer(po.GetArg(2)); - - - for (; !feat_reader.Done(); feat_reader.Next()) { - const std::string &key = feat_reader.Key(); - const Matrix &src = feat_reader.Value(); - if (src.NumRows() == 0) { - KALDI_WARN << "Empty matrix for key " << key; - num_err++; - continue; + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + SequentialBaseFloatMatrixReader feat_reader(po.GetArg(1)); + BaseFloatMatrixWriter feat_writer(po.GetArg(2)); + + + for (; !feat_reader.Done(); feat_reader.Next()) { + const std::string &key = feat_reader.Key(); + const Matrix &src = feat_reader.Value(); + if (src.NumRows() == 0) { + KALDI_WARN << "Empty matrix for key " << key; + num_err++; + continue; + } + Matrix rearranged(src.NumRows(), src.NumCols()); + ShiftFeatureMatrix(src, shift, &rearranged); + feat_writer.Write(key, rearranged); + num_done++; } + + KALDI_LOG << "Shifted " << num_done << " features by " + << shift << " frames; " << num_err 
<< " with errors."; + return (num_done > 0 ? 0 : 1); + } else { + std::string feat_rxfilename = po.GetArg(1), + feat_wxfilename = po.GetArg(2); + Matrix src; + ReadKaldiObject(feat_rxfilename, &src); + if (src.NumRows() == 0) + KALDI_ERR << "Empty input matrix"; Matrix rearranged(src.NumRows(), src.NumCols()); - for (int32 r = 0; r < src.NumRows(); r++) { - int32 src_r = r - shift; - if (src_r < 0) src_r = 0; - if (src_r >= src.NumRows()) src_r = src.NumRows() - 1; - rearranged.Row(r).CopyFromVec(src.Row(src_r)); - } - feat_writer.Write(key, rearranged); - num_done++; + ShiftFeatureMatrix(src, shift, &rearranged); + WriteKaldiObject(rearranged, feat_wxfilename, binary); + // we do not print any log messages here } - - KALDI_LOG << "Shifted " << num_done << " features by " - << shift << " frames; " << num_err << " with errors."; - return (num_done > 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; @@ -87,4 +116,8 @@ int main(int argc, char *argv[]) { 1 1 1 1 2 2 ] + + + echo "[ 1 1; 2 2; 3 3 ]" | ./shift-feats --print-args=false --binary=false \ + --shift=1 - - */ From 6582acf773debc16dba2a67cebf3f6ddaae74100 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 2 Feb 2017 20:32:29 -0500 Subject: [PATCH 05/21] [scripts] Update example scripts for dropout on Tedlium s5_r2 --- .../local/chain/tuning/run_tdnn_lstm_1p.sh | 4 + .../local/chain/tuning/run_tdnn_lstm_1s.sh | 383 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1t.sh | 382 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1u.sh | 385 ++++++++++++++++++ 4 files changed, 1154 insertions(+) create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh create mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh index f06f4a7f6ec..eecc6bc2544 100755 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh @@ -1,5 +1,9 @@ #!/bin/bash +# [note: this was later run as 1p2, with code and script changes that +# meant it was using dropout on 3 gates, as Gaofeng was really doing, +# not 2 as I thought he was doing.] + # 1p is as 1k, but [via script changes] doing the dropout as Gaofeng # did it in the non-fast LSTMs, with separate per-frame masks on # the i and f component. Using dropout schedule that maxes out at diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh new file mode 100755 index 00000000000..a9fa14ae132 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh @@ -0,0 +1,383 @@ +#!/bin/bash + +# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); +# will do 1t as the baseline without dropout. [note: mistakenly, this was run +# with not-per-frame dropout]. +# Results are not that encouraging. It's just slightly better than 1t. 
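+
+# [For reference -- this is just an illustration, not the code path that
+# steps/nnet3/chain/train.py actually uses: a dropout schedule string like
+# '0,0@0.20,0.3@0.5,0@0.75,0' is a list of proportion[@data-fraction] points,
+# with the first and last points implicitly at data-fractions 0.0 and 1.0 and
+# linear interpolation in between.  Under that reading, the proportion in
+# effect at data-fraction 0.35 would be:
+#   echo "0@0 0@0.20 0.3@0.5 0@0.75 0@1" | awk -v x=0.35 '{
+#     for (n = 1; n < NF; n++) {
+#       split($n, a, "@"); split($(n+1), b, "@");
+#       if (x >= a[2] && x <= b[2]) {
+#         print a[1] + (b[1] - a[1]) * (x - a[2]) / (b[2] - a[2]); exit; } } }'
+#   # prints 0.15, i.e. half-way up the ramp from 0.0 at 20% of the data to
+#   # the maximum of 0.3 at 50%.]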
+ +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi +# WER on dev(orig) 9.0 8.7 9.1 9.2 +# [looped:] 9.0 8.7 9.1 9.2 +# WER on dev(rescored) 8.4 8.2 8.3 8.6 +# [looped:] 8.4 8.2 8.3 8.6 +# WER on test(orig) 8.8 8.8 9.0 9.1 +# [looped:] 8.8 8.8 9.0 9.0 +# WER on test(rescored) 8.4 8.3 8.4 8.6 +# [looped:] 8.3 8.3 8.4 8.7 +# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 +# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 +# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 +# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. +# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1s #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
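+  # (Once both this stage and the regular decoding in stage 20 have run, the
+  # two sets of results can be compared with the same tool used for the
+  # comparisons in the comments at the top of this file, e.g.:
+  #   local/chain/compare_wer_general.sh --looped \
+  #     exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi
+  # which prints the [looped:] rows alongside the regular WERs.)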
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh new file mode 100755 index 00000000000..724081a4c61 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh @@ -0,0 +1,382 @@ +#!/bin/bash + +# 1t is as 1s but without dropout; it could be compared to 1e. +# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); +# will do 1t as the baseline without dropout. Seems a bit worse than +# the fast-LSTM code. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1t_sp_bi +# WER on dev(orig) 9.0 9.2 +# [looped:] 9.0 9.2 +# WER on dev(rescored) 8.4 8.6 +# [looped:] 8.4 8.6 +# WER on test(orig) 8.8 9.1 +# [looped:] 8.8 9.0 +# WER on test(rescored) 8.4 8.6 +# [looped:] 8.3 8.7 +# Final train prob -0.0648 -0.0618 +# Final valid prob -0.0827 -0.0794 +# Final train prob (xent) -0.8372 -0.8120 +# Final valid prob (xent) -0.9497 -0.9396 + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. +# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] 
+# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1t #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
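+  # (Since dropout-per-frame is not set here, the dropout mask is presumably
+  # sampled independently per element rather than once per frame -- which is
+  # what the "[note: mistakenly, this was run with not-per-frame dropout]"
+  # comment at the top of this file refers to; 1u sets dropout-per-frame=True.)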
+ lstmp_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh new file mode 100755 index 00000000000..eda096b487b --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh @@ -0,0 +1,385 @@ +#!/bin/bash + +# 1u is as 1s, but adding dropout-per-frame=true. +# Slightly better than 1s, but the improvement versus the baseline 1t is +# rather disappointing (only about 0.4 at most). + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi tdnn_lstm1u_sp_bi +# WER on dev(orig) 9.0 8.7 9.1 9.2 9.0 +# [looped:] 9.0 8.7 9.1 9.2 8.9 +# WER on dev(rescored) 8.4 8.2 8.3 8.6 8.1 +# [looped:] 8.4 8.2 8.3 8.6 8.1 +# WER on test(orig) 8.8 8.8 9.0 9.1 8.7 +# [looped:] 8.8 8.8 9.0 9.0 8.7 +# WER on test(rescored) 8.4 8.3 8.4 8.6 8.3 +# [looped:] 8.3 8.3 8.4 8.7 8.3 +# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -0.0723 +# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -0.0828 +# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -0.9042 +# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 -0.9879 + +# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); +# will do 1t as the baseline without dropout. + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. +# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] 
+# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
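+  # (dropout-per-frame=True below asks for one dropout decision per frame,
+  # shared across the whole vector, rather than an independent mask per
+  # element.  A quick sanity check that the option reached the generated
+  # configs is something like the following -- path shown for the default
+  # affix, adjust if you change --tdnn-lstm-affix:
+  #   grep -i dropout exp/chain_cleaned/tdnn_lstm1u_sp_bi/configs/*.config )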
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=True" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 From eb94ffde0f2c8eb132c264bfcde4f50122e7de0c Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 21 Mar 2017 12:32:40 +0800 Subject: [PATCH 06/21] for ref --- .vscode/settings.json | 3 + .../chain/tuning/run_tdnn_lstm_1u_1024.sh | 387 ++++++++++++++++++ .../tdnn_lstm_1u_newschedule_5epoch_1024.sh | 1 + 3 files changed, 391 insertions(+) create mode 100644 .vscode/settings.json create mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh create mode 100644 egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000000..fe7159848bd --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.linting.pylintEnabled": false +} \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh new file mode 100644 index 00000000000..e6a44bd0bc8 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh @@ -0,0 +1,387 @@ +#!/bin/bash + +# 1u is as 1s, but adding dropout-per-frame=true. +# Slightly better than 1s, but the improvement versus the baseline 1t is +# rather disappointing (only about 0.4 at most). + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi tdnn_lstm1u_sp_bi +# WER on dev(orig) 9.0 8.7 9.1 9.2 9.0 +# [looped:] 9.0 8.7 9.1 9.2 8.9 +# WER on dev(rescored) 8.4 8.2 8.3 8.6 8.1 +# [looped:] 8.4 8.2 8.3 8.6 8.1 +# WER on test(orig) 8.8 8.8 9.0 9.1 8.7 +# [looped:] 8.8 8.8 9.0 9.0 8.7 +# WER on test(rescored) 8.4 8.3 8.4 8.6 8.3 +# [looped:] 8.3 8.3 8.4 8.7 8.3 +# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -0.0723 +# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -0.0828 +# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -0.9042 +# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 -0.9879 + +# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); +# will do 1t as the baseline without dropout. + +# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng +# did it in the non-fast LSTMs, with separate per-frame masks on +# the i and f component. Using dropout schedule that maxes out at +# 0.3, which he found worked best for that type of dropout. + +# [See about 20 lines below for the original comparison with the baseline, +# done when "p" was dropping out 2 gates [the i and f gates]. 
+# The comparison directly below is between the version that dropped out +# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent +# difference there.] +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi +#_sp_bi +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi +# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi +# WER on dev(orig) 8.9 8.7 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.7 8.8 +# WER on test(rescored) 8.1 8.3 +# Final train prob -0.0712 -0.0717 +# Final valid prob -0.0848 -0.0834 +# Final train prob (xent) -0.8903 -0.9147 +# Final valid prob (xent) -0.9719 -0.9977 + +# +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.1 +# [looped:] 9.0 8.6 8.8 9.0 +# WER on dev(rescored) 8.4 7.9 8.4 8.3 +# [looped:] 8.4 7.8 8.3 8.2 +# WER on test(orig) 8.8 8.8 8.7 8.9 +# [looped:] 8.8 8.7 8.6 8.9 +# WER on test(rescored) 8.4 8.3 8.1 8.3 +# [looped:] 8.3 8.3 8.1 8.3 +# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 +# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 +# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 +# +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. 
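+
+# Note that dropout_schedule and num_epoch default to empty below and are
+# expected to be supplied on the command line; e.g. the accompanying wrapper
+# script egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh (added in
+# the same commit) runs this script as:
+#   local/chain/tuning/run_tdnn_lstm_1u_1024.sh --train-stage 68 \
+#     --dropout-schedule "0,0@0.20,0.3@0.5,0" --num-epoch 5 \
+#     --tdnn-lstm-affix 1u_newschedule_5epoch_1024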
+ + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=17 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +dropout_schedule= +num_epoch= +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=True" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule="$dropout_schedule" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epoch \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+  rm $dir/.error 2>/dev/null || true
+  for dset in dev test; do
+      (
+      steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \
+          --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context-initial $extra_left_context_initial \
+          --frames-per-chunk 30 \
+          --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \
+          --scoring-opts "--min-lmwt 5 " \
+         $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1;
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
+        data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1
+      ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in decoding"
+    exit 1
+  fi
+fi
+
+
+exit 0
diff --git a/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh b/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh
new file mode 100644
index 00000000000..d41fb4f82c2
--- /dev/null
+++ b/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh
@@ -0,0 +1 @@
+local/chain/tuning/run_tdnn_lstm_1u_1024.sh --train-stage 68 --dropout-schedule "0,0@0.20,0.3@0.5,0" --num-epoch 5 --tdnn-lstm-affix 1u_newschedule_5epoch_1024

From 9afaf399ce353540839f9b7ebbde172eb8c29367 Mon Sep 17 00:00:00 2001
From: Gaofeng Cheng <770579626@qq.com>
Date: Sun, 9 Apr 2017 10:36:28 +0800
Subject: [PATCH 07/21] delete temporary tuning scripts in tedlium

---
 .../local/chain/tuning/run_tdnn_lstm_1p.sh    | 367 -----------------
 .../local/chain/tuning/run_tdnn_lstm_1q.sh    | 348 ----------------
 .../local/chain/tuning/run_tdnn_lstm_1s.sh    | 383 -----------------
 .../local/chain/tuning/run_tdnn_lstm_1t.sh    | 382 -----------------
 .../local/chain/tuning/run_tdnn_lstm_1u.sh    | 385 -----------------
 .../chain/tuning/run_tdnn_lstm_1u_1024.sh     | 387 ------------------
 .../tdnn_lstm_1u_newschedule_5epoch_1024.sh   |   1 -
 7 files changed, 2253 deletions(-)
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh
 delete mode 100755 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh
 delete mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh
 delete mode 100644 egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh

diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh
deleted file mode 100755
index eecc6bc2544..00000000000
--- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1p.sh
+++ /dev/null
@@ -1,367 +0,0 @@
-#!/bin/bash
-
-# [note: this was later run as 1p2, with code and script changes that
-# meant it was using dropout on 3 gates, as Gaofeng was really doing,
-# not 2 as I thought he was doing.]
-
-# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng
-# did it in the non-fast LSTMs, with separate per-frame masks on
-# the i and f component. Using dropout schedule that maxes out at
-# 0.3, which he found worked best for that type of dropout.
-
-# [See about 20 lines below for the original comparison with the baseline,
-# done when "p" was dropping out 2 gates [the i and f gates].
-# The comparison directly below is between the version that dropped out
-# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent
-# difference there.]
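For reference, the per-frame gate dropout described in the comments above works as follows: at every frame the i gate gets its own 0/1 dropout mask and the f gate gets a second, independently sampled one (the "p2" variant extends this to the o gate as well), with a single mask value per frame shared across the gate's dimensions. The NumPy sketch below only illustrates that sampling under assumed shapes and names; the real masks are produced inside the nnet3/cudamatrix code this patch series touches, not in these scripts.

  import numpy as np

  def per_frame_gate_masks(num_frames, dropout_proportion, num_gates=2, seed=0):
      # One Bernoulli draw per frame and per gate: the whole gate activation
      # at frame t is either kept or zeroed, and the i and f gates use
      # independently sampled masks (num_gates=3 would cover the o gate too).
      # Any rescaling of the kept values is left to the real component; this
      # only illustrates the independent per-frame sampling.
      rng = np.random.default_rng(seed)
      keep_prob = 1.0 - dropout_proportion
      return rng.binomial(1, keep_prob, size=(num_gates, num_frames)).astype(np.float32)

  # e.g. masks for a 20-frame chunk at dropout proportion 0.3:
  i_mask, f_mask = per_frame_gate_masks(20, 0.3)
  # the LSTM recursion would then use i_t * i_mask[t] and f_t * f_mask[t].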
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1p2 #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
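To make the schedule mentioned in the comment above concrete: a string such as '0,0@0.20,0.3@0.5,0@0.75,0' (the value passed to --trainer.dropout-schedule further down) is, roughly, read as (proportion, fraction-of-training) points and interpolated linearly between them, so the proportion stays at 0 for the first 20% of training, ramps up to 0.3 at the halfway point, and falls back to 0 by 75%. The sketch below is a simplified re-implementation for illustration only, not the parser the training scripts actually use.

  def dropout_at(progress, schedule="0,0@0.20,0.3@0.5,0@0.75,0"):
      # Each comma-separated field is "proportion@data_fraction"; a bare
      # proportion stands for fraction 0.0 if it is the first field and 1.0
      # if it is the last.  Values in between are linearly interpolated.
      points = []
      fields = schedule.split(",")
      for i, field in enumerate(fields):
          if "@" in field:
              value, frac = field.split("@")
          else:
              value, frac = field, "0.0" if i == 0 else "1.0"
          points.append((float(frac), float(value)))
      points.sort()
      for (x0, y0), (x1, y1) in zip(points, points[1:]):
          if x0 <= progress <= x1:
              return y0 if x1 == x0 else y0 + (y1 - y0) * (progress - x0) / (x1 - x0)
      return points[-1][1]

  # dropout_at(0.35) -> 0.15, dropout_at(0.5) -> 0.3, dropout_at(0.9) -> 0.0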
- lstmp_opts="decay-time=20 dropout-proportion=0.0" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh deleted file mode 100755 index f6a640fe17f..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1q.sh +++ /dev/null @@ -1,348 +0,0 @@ -#!/bin/bash - -# 1q is as 1p, but add the "dropout-exclusive" option which means that -# never drops out *both* the i and f gates. -# not helpful. see run_tdnn_lstm_1p.sh for results. - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. -# -# 1k is as 1e, but introducing a dropout schedule. - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.0 -# [looped:] 9.0 8.6 8.9 8.9 -# WER on dev(rescored) 8.4 7.9 8.2 8.2 -# [looped:] 8.4 7.8 8.2 8.3 -# WER on test(orig) 8.8 8.8 8.9 8.9 -# [looped:] 8.8 8.7 8.8 8.8 -# WER on test(rescored) 8.4 8.3 8.2 8.5 -# [looped:] 8.3 8.3 8.3 8.4 -# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 -# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 -# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 -# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 - - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). 
-# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1q #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. - lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-exclusive=true" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
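Returning to the dropout-exclusive option that distinguishes this 1q script: per frame, the i gate may be dropped or the f gate may be dropped, but never both at once. The sketch below samples such a pair of masks from a single uniform draw; it is only one way to realize the constraint, not the component's actual sampling code, and it assumes dropout_proportion <= 0.5 so the two drop regions fit.

  import numpy as np

  def exclusive_if_masks(num_frames, dropout_proportion, seed=0):
      # Drop i when u falls in [0, p) and drop f when u falls in [p, 2p);
      # the two regions are disjoint, so the gates are never dropped
      # together, yet each is still dropped with marginal probability p.
      rng = np.random.default_rng(seed)
      u = rng.uniform(size=num_frames)
      p = dropout_proportion
      i_mask = (u >= p).astype(np.float32)
      f_mask = ((u < p) | (u >= 2 * p)).astype(np.float32)
      return i_mask, f_mask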
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh deleted file mode 100755 index a9fa14ae132..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh +++ /dev/null @@ -1,383 +0,0 @@ -#!/bin/bash - -# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); -# will do 1t as the baseline without dropout. [note: mistakenly, this was run -# with not-per-frame dropout]. -# Results are not that encouraging. It's just slightly better than 1t. - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi -# WER on dev(orig) 9.0 8.7 9.1 9.2 -# [looped:] 9.0 8.7 9.1 9.2 -# WER on dev(rescored) 8.4 8.2 8.3 8.6 -# [looped:] 8.4 8.2 8.3 8.6 -# WER on test(orig) 8.8 8.8 9.0 9.1 -# [looped:] 8.8 8.8 9.0 9.0 -# WER on test(rescored) 8.4 8.3 8.4 8.6 -# [looped:] 8.3 8.3 8.4 8.7 -# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. - -# [See about 20 lines below for the original comparison with the baseline, -# done when "p" was dropping out 2 gates [the i and f gates]. -# The comparison directly below is between the version that dropped out -# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent -# difference there.] 
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1s #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
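In the same config-generation block, the learning_rate_factor computed a few lines above works out as follows (note the one-liner assumes a python2-style print statement): with xent_regularize=0.1 the factor is 0.5 / 0.1 = 5.0. A trivial worked version of that arithmetic, for illustration only:

  xent_regularize = 0.1
  learning_rate_factor = 0.5 / xent_regularize   # = 5.0
  # The xent objective is weighted by xent_regularize, so scaling the xent
  # output layer's learning rate by roughly 1/xent_regularize (the extra 0.5
  # was tuned) keeps its effective learning speed independent of that constant.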
- lstmp_opts="decay-time=20 dropout-proportion=0.0" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh deleted file mode 100755 index 724081a4c61..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh +++ /dev/null @@ -1,382 +0,0 @@ -#!/bin/bash - -# 1t is as 1s but without dropout; it could be compared to 1e. -# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); -# will do 1t as the baseline without dropout. Seems a bit worse than -# the fast-LSTM code. - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1t_sp_bi -# WER on dev(orig) 9.0 9.2 -# [looped:] 9.0 9.2 -# WER on dev(rescored) 8.4 8.6 -# [looped:] 8.4 8.6 -# WER on test(orig) 8.8 9.1 -# [looped:] 8.8 9.0 -# WER on test(rescored) 8.4 8.6 -# [looped:] 8.3 8.7 -# Final train prob -0.0648 -0.0618 -# Final valid prob -0.0827 -0.0794 -# Final train prob (xent) -0.8372 -0.8120 -# Final valid prob (xent) -0.9497 -0.9396 - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. - -# [See about 20 lines below for the original comparison with the baseline, -# done when "p" was dropping out 2 gates [the i and f gates]. -# The comparison directly below is between the version that dropped out -# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent -# difference there.] 
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1t #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
- lstmp_opts="decay-time=20" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
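A side note on the --acwt 1.0 --post-decode-acwt 10.0 pair used in both decoding stages of these scripts: chain models are trained with an acoustic scale of 1.0, and multiplying the acoustic scores by 10 when the lattices are written lets the usual integer LM-weight sweep in the scoring scripts (--min-lmwt 5 and up) cover sensible relative scales. A rough illustration of the resulting effective scale, under the assumption that scoring weights the LM by lmwt:

  def effective_acoustic_scale(lmwt, post_decode_acwt=10.0):
      # Weighting the LM by lmwt is equivalent to scaling the acoustics by
      # 1/lmwt, so with acoustics pre-multiplied by post_decode_acwt the net
      # acoustic scale is post_decode_acwt / lmwt; lmwt=10 recovers the
      # scale of 1.0 the chain model was trained with.
      return post_decode_acwt / lmwt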
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh deleted file mode 100755 index eda096b487b..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh +++ /dev/null @@ -1,385 +0,0 @@ -#!/bin/bash - -# 1u is as 1s, but adding dropout-per-frame=true. -# Slightly better than 1s, but the improvement versus the baseline 1t is -# rather disappointing (only about 0.4 at most). - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi tdnn_lstm1u_sp_bi -# WER on dev(orig) 9.0 8.7 9.1 9.2 9.0 -# [looped:] 9.0 8.7 9.1 9.2 8.9 -# WER on dev(rescored) 8.4 8.2 8.3 8.6 8.1 -# [looped:] 8.4 8.2 8.3 8.6 8.1 -# WER on test(orig) 8.8 8.8 9.0 9.1 8.7 -# [looped:] 8.8 8.8 9.0 9.0 8.7 -# WER on test(rescored) 8.4 8.3 8.4 8.6 8.3 -# [looped:] 8.3 8.3 8.4 8.7 8.3 -# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -0.0723 -# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -0.0828 -# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -0.9042 -# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 -0.9879 - -# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); -# will do 1t as the baseline without dropout. - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. - -# [See about 20 lines below for the original comparison with the baseline, -# done when "p" was dropping out 2 gates [the i and f gates]. -# The comparison directly below is between the version that dropped out -# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent -# difference there.] 
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
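The comment above notes that the dropout-proportion written into the config only needs to be present, because the --trainer.dropout-schedule option (set to '0,0@0.20,0.3@0.5,0@0.75,0' in stage 18 below) controls the actual proportion during training. The sketch below shows one way to read such a schedule string, assuming piecewise-linear interpolation between (training-fraction, proportion) points with bare first and last entries pinned to fractions 0.0 and 1.0; this is an assumed reading for illustration, not the actual parsing code invoked by train.py.

def dropout_at(schedule, progress):
    # 'p0,p1@f1,...,pN' -> dropout proportion at a given fraction of training done.
    points = []
    for k, piece in enumerate(schedule.split(',')):
        if '@' in piece:
            p, f = piece.split('@')
            points.append((float(f), float(p)))
        else:
            # entries without '@' are assumed to sit at the start/end of training
            points.append((0.0 if k == 0 else 1.0, float(piece)))
    points.sort()
    for (f0, p0), (f1, p1) in zip(points, points[1:]):
        if f0 <= progress <= f1:
            return p0 if f1 == f0 else p0 + (p1 - p0) * (progress - f0) / (f1 - f0)
    return points[-1][1]

# The schedule used here ramps from 0 up to 0.3 between 20% and 50% of
# training, then back down to 0 by 75%:
for prog in (0.0, 0.2, 0.35, 0.5, 0.6, 0.75, 1.0):
    print(prog, dropout_at('0,0@0.20,0.3@0.5,0@0.75,0', prog))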
- lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=True" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) - lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) - lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs 4 \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh deleted file mode 100644 index e6a44bd0bc8..00000000000 --- a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u_1024.sh +++ /dev/null @@ -1,387 +0,0 @@ -#!/bin/bash - -# 1u is as 1s, but adding dropout-per-frame=true. -# Slightly better than 1s, but the improvement versus the baseline 1t is -# rather disappointing (only about 0.4 at most). - -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1p2_sp_bi tdnn_lstm1s_sp_bi tdnn_lstm1t_sp_bi tdnn_lstm1u_sp_bi -# WER on dev(orig) 9.0 8.7 9.1 9.2 9.0 -# [looped:] 9.0 8.7 9.1 9.2 8.9 -# WER on dev(rescored) 8.4 8.2 8.3 8.6 8.1 -# [looped:] 8.4 8.2 8.3 8.6 8.1 -# WER on test(orig) 8.8 8.8 9.0 9.1 8.7 -# [looped:] 8.8 8.8 9.0 9.0 8.7 -# WER on test(rescored) 8.4 8.3 8.4 8.6 8.3 -# [looped:] 8.3 8.3 8.4 8.7 8.3 -# Final train prob -0.0648 -0.0717 -0.0693 -0.0618 -0.0723 -# Final valid prob -0.0827 -0.0833 -0.0859 -0.0794 -0.0828 -# Final train prob (xent) -0.8372 -0.8979 -0.8802 -0.8120 -0.9042 -# Final valid prob (xent) -0.9497 -0.9844 -0.9934 -0.9396 -0.9879 - -# 1s is as 1p, but reverting to the non-fast LSTM code (still with dropout); -# will do 1t as the baseline without dropout. - -# 1p is as 1k, but [via script changes] doing the dropout as Gaofeng -# did it in the non-fast LSTMs, with separate per-frame masks on -# the i and f component. Using dropout schedule that maxes out at -# 0.3, which he found worked best for that type of dropout. - -# [See about 20 lines below for the original comparison with the baseline, -# done when "p" was dropping out 2 gates [the i and f gates]. -# The comparison directly below is between the version that dropped out -# 2 gates (p) with the one that dropped out 3 gates (p2). No consistent -# difference there.] 
-# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1{p,p2}_sp_bi -#_sp_bi -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1p_sp_bi exp/chain_cleaned/tdnn_lstm1p2_sp_bi -# System tdnn_lstm1p_sp_bi tdnn_lstm1p2_sp_bi -# WER on dev(orig) 8.9 8.7 -# WER on dev(rescored) 8.4 8.2 -# WER on test(orig) 8.7 8.8 -# WER on test(rescored) 8.1 8.3 -# Final train prob -0.0712 -0.0717 -# Final valid prob -0.0848 -0.0834 -# Final train prob (xent) -0.8903 -0.9147 -# Final valid prob (xent) -0.9719 -0.9977 - -# -# -# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,p,q}_sp_bi -# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1p_sp_bi tdnn_lstm1q_sp_bi -# WER on dev(orig) 9.0 8.7 8.9 9.1 -# [looped:] 9.0 8.6 8.8 9.0 -# WER on dev(rescored) 8.4 7.9 8.4 8.3 -# [looped:] 8.4 7.8 8.3 8.2 -# WER on test(orig) 8.8 8.8 8.7 8.9 -# [looped:] 8.8 8.7 8.6 8.9 -# WER on test(rescored) 8.4 8.3 8.1 8.3 -# [looped:] 8.3 8.3 8.1 8.3 -# Final train prob -0.0648 -0.0693 -0.0712 -0.0698 -# Final valid prob -0.0827 -0.0854 -0.0848 -0.0875 -# Final train prob (xent) -0.8372 -0.8848 -0.8903 -0.8721 -# Final valid prob (xent) -0.9497 -0.9895 -0.9719 -0.9828 -# -# 1k is as 1e, but introducing a dropout schedule. - -# 1e is as 1b, but reducing decay-time from 40 to 20. - -# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it -# uses egs from 1b, remember to remove that before I commit. - -# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi -# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) - -# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below -# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had -# better results. Note: these results are not with the updated LM (the LM data-prep -# for this setup was changed in Nov 2016 but this was with an older directory). -# -# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi -# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi -# WER on dev(orig) 10.3 10.7 9.7 -# WER on dev(rescored) 9.8 10.1 9.3 -# WER on test(orig) 9.7 9.8 9.1 -# WER on test(rescored) 9.2 9.4 8.7 -# Final train prob -0.0812 -0.0862 -0.0625 -# Final valid prob -0.1049 -0.1047 -0.0910 -# Final train prob (xent) -1.1334 -1.1763 -0.8518 -# Final valid prob (xent) -1.2263 -1.2427 -0.9972 - -## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn_lstm.sh - -# without cleanup: -# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run one of the non-chain nnet3 systems -# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - -# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly -# standard, LSTM, except that some TDNN layers were added in between the -# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but -# this isn't exactly copied from there. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=17 -nj=30 -decode_nj=30 -min_seg_len=1.55 -label_delay=5 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -dropout_schedule= -num_epoch= -# training options -chunk_left_context=40 -chunk_right_context=0 -chunk_left_context_initial=0 -chunk_right_context_final=0 -# decode options -extra_left_context=50 -extra_right_context=0 -extra_left_context_initial=0 -extra_right_context_final=0 -frames_per_chunk=140,100,160 -frames_per_chunk_primary=140 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - - -if [ $stage -le 17 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - # note: the value of the dropout-proportion is not important, as it's - # controlled by the dropout schedule; what's important is that we set it. 
- lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=True" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) - lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) - relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) - lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts - relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) - relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) - lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstmp_opts - - ## adding the layers for chain branch - output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --trainer.dropout-schedule="$dropout_schedule" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width "$frames_per_chunk" \ - --egs.chunk-left-context "$chunk_left_context" \ - --egs.chunk-right-context "$chunk_right_context" \ - --egs.chunk-left-context-initial "$chunk_left_context_initial" \ - --egs.chunk-right-context-final "$chunk_right_context_final" \ - --trainer.num-chunk-per-minibatch 128,64 \ - --trainer.frames-per-iter 1500000 \ - --trainer.max-param-change 2.0 \ - --trainer.num-epochs $num_epoch \ - --trainer.deriv-truncate-margin 10 \ - --trainer.optimization.shrink-value 0.99 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.momentum 0.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir \ - --cleanup=false - # --cleanup=false is temporary while debugging. -fi - - - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ - --extra-left-context-initial $extra_left_context_initial \ - --extra-right-context-final $extra_right_context_final \ - --frames-per-chunk "$frames_per_chunk_primary" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -if [ $stage -le 21 ]; then - # 'looped' decoding. we didn't write a -parallel version of this program yet, - # so it will take a bit longer as the --num-threads option is not supported. - # we just hardcode the --frames-per-chunk option as it doesn't have to - # match any value used in training, and it won't affect the results (unlike - # regular decoding). 
- rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context-initial $extra_left_context_initial \ - --frames-per-chunk 30 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi - - -exit 0 diff --git a/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh b/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh deleted file mode 100644 index d41fb4f82c2..00000000000 --- a/egs/tedlium/s5_r2/tdnn_lstm_1u_newschedule_5epoch_1024.sh +++ /dev/null @@ -1 +0,0 @@ -local/chain/tuning/run_tdnn_lstm_1u_1024.sh --train-stage 68 --dropout-schedule "0,0@0.20,0.3@0.5,0" --num-epoch 5 --tdnn-lstm-affix 1u_newschedule_5epoch_1024 From e9ac4e2343805f4f38e824f0dfd65cd9cca7dc1b Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 10:39:25 +0800 Subject: [PATCH 08/21] delete irrelevant file --- .vscode/settings.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index fe7159848bd..00000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.linting.pylintEnabled": false -} \ No newline at end of file From 638f0834c85efaf476f14d91f33d97412a039fea Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 11:13:04 +0800 Subject: [PATCH 09/21] delete exclusive option in fast lstm code --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 2 -- src/nnet3/nnet-general-component.cc | 22 +++------------------ src/nnet3/nnet-general-component.h | 9 +-------- 3 files changed, 4 insertions(+), 29 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 55ac4704c0a..9d95e41ab12 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -720,7 +720,6 @@ def set_default_configs(self): # be used (note: this is # per-frame dropout on the # output of the i_t and f_t gates) - 'dropout-exclusive' : False # option affecting dropout masks. } def set_derived_configs(self): @@ -820,7 +819,6 @@ def generate_lstm_config(self): lstm_str = self.config['lstm-nonlinearity-options'] dropout_proportion = self.config['dropout-proportion'] - dropout_exclusive = 'true' if self.config['dropout-exclusive'] else 'false' configs = [] diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 3f47e1e01d2..761ffbd6815 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1380,20 +1380,17 @@ std::string DropoutMaskComponent::Info() const { std::ostringstream stream; stream << Type() << ", output-dim=" << output_dim_ - << ", dropout-proportion=" << dropout_proportion_ - << ", exclusive=" << (exclusive_ ? 
"true" : "false"); + << ", dropout-proportion=" << dropout_proportion_; return stream.str(); } DropoutMaskComponent::DropoutMaskComponent(): - output_dim_(-1), dropout_proportion_(0.5), - exclusive_(false) { } + output_dim_(-1), dropout_proportion_(0.5) { } DropoutMaskComponent::DropoutMaskComponent( const DropoutMaskComponent &other): output_dim_(other.output_dim_), - dropout_proportion_(other.dropout_proportion_), - exclusive_(other.exclusive_) { } + dropout_proportion_(other.dropout_proportion_) { } void DropoutMaskComponent::Propagate( const ComponentPrecomputedIndexes *indexes, @@ -1407,16 +1404,9 @@ void DropoutMaskComponent::Propagate( out->Set(1.0); return; } - if (!exclusive_) { const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout_proportion); out->ApplyHeaviside(); - } else { - if (!(output_dim_ == 2 && dropout_proportion <= 0.5)) { - KALDI_ERR << "If exclusive=true is set, output-dim must equal 2 (got: " - << output_dim_ << " and dropout-proportion must <= 0.5 (got: " - << dropout_proportion; - } // To generate data where it's never the case that both of the dimensions // for a row are zero, we generate uniformly distributed data (call this u_i), // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) @@ -1442,8 +1432,6 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &exclusive_); ExpectToken(is, binary, ""); } @@ -1454,8 +1442,6 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, output_dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_proportion_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, exclusive_); WriteToken(os, binary, ""); } @@ -1469,8 +1455,6 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(ok && output_dim_ > 0); dropout_proportion_ = 0.5; cfl->GetValue("dropout-proportion", &dropout_proportion_); - exclusive_ = false; - cfl->GetValue("exclusive", &exclusive_); } diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index d3de9f40548..d5d7a140177 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -689,8 +689,7 @@ class DropoutMaskComponent: public RandomComponent { virtual std::string Info() const; // possible parameter values with their defaults: - // dropout-proportion=0.5 output-dim=-1 exclusive=false - // [for the meaning of 'exclusive', see its declaration]. + // dropout-proportion=0.5 output-dim=-1 virtual void InitFromConfig(ConfigLine *cfl); DropoutMaskComponent(); @@ -745,12 +744,6 @@ class DropoutMaskComponent: public RandomComponent { BaseFloat dropout_proportion_; - // If true, and only in the special case where output_dim_ == 2, this - // component will make sure that it's never the case that both columns of a - // row of the output are zero. Note: if this is true, you cannot set - // dropout_proportion_ > 0.5. - bool exclusive_; - const DropoutMaskComponent &operator = (const DropoutMaskComponent &other); // Disallow. 
}; From 49c4558c27b10a89c0337e4d9d7779dca1424070 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 13:10:12 +0800 Subject: [PATCH 10/21] solve some cuda-kernel line mismatch problem --- src/cudamatrix/cu-kernels-ansi.h | 130 ++++++++++--------------------- src/cudamatrix/cu-kernels.h | 107 ++++++------------------- 2 files changed, 63 insertions(+), 174 deletions(-) diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index cbdecfb7386..5b72a62e716 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -330,6 +330,7 @@ void cudaF_diff_log_softmax(dim3 Gr, dim3 Bl, const MatrixDim in_deriv_dim, const float* out_deriv, const int out_deriv_stride, float* in_deriv); void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int in_stride, const double* params, const int params_stride, @@ -349,6 +350,7 @@ void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, double* self_repair_sum_out, const int self_repair_sum_out_stride); void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int in_stride, const float* params, const int params_stride, @@ -455,12 +457,14 @@ void cudaF_log_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, double* out); void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, - const int cell_dim, const int num_rows, + const int cell_dim, const int have_dropout_mask, + const int num_rows, float* out); void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, @@ -636,93 +640,41 @@ void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, double* trace_vec_out); - -void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, - double alpha, MatrixElement* x, - int num_elements); -void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, - double alpha, const Int32Pair* indices, - const double* x, int s, double* data); -void cudaD_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int s, - const double* z, MatrixDim d, double* z2, - MatrixDim d2, double* t); - -void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, - const double* T, MatrixDim tdim, double *S, - MatrixDim sdim); -void cudaD_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, - const double *src_data, MatrixDim src_dim, - const Int32Pair *indices); -void cudaD_add_row_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, - const double *src_data, MatrixDim src_dim, - const Int32Pair *indexes); -void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim, - const Int32Pair *indices, int indices_size, - double *output); - -void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, - const double *mat2, double *mask, - MatrixDim mat1_dim, int mat2_stride, - int mask_stride); - -void cudaD_lstm_nonlinearity(dim3 Gr, dim3 Bl, const 
double* in, - const int in_stride, const double* params, - const int params_stride, const int out_stride, - const int cell_dim, const int have_dropout_mask, - const int num_rows, - double* out); -void cudaF_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, - const int in_stride, const float* params, - const int params_stride, const int out_stride, - const int cell_dim, const int have_dropout_mask, - const int num_rows, - float* out); -void cudaD_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, - const int have_dropout_mask, - const int num_rows, const double* input, - const int in_stride, const double* params, - const int params_stride, - const double* output_deriv, - const int output_deriv_stride, - const double* deriv_sum_in, - const int deriv_sum_in_stride, - const double* self_repair_config, - double count, double* input_deriv, - const int input_deriv_stride, - double* params_deriv, - const int params_deriv_stride, - double* value_sum_out, - const int value_sum_out_stride, - double* deriv_sum_out, - const int deriv_sum_out_stride, - double* self_repair_sum_out, - const int self_repair_sum_out_stride); -void cudaF_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, - const int have_dropout_mask, - const int num_rows, const float* input, - const int in_stride, const float* params, - const int params_stride, - const float* output_deriv, - const int output_deriv_stride, - const double* deriv_sum_in, - const int deriv_sum_in_stride, - const float* self_repair_config, double count, - float* input_deriv, - const int input_deriv_stride, - float* params_deriv, - const int params_deriv_stride, - double* value_sum_out, - const int value_sum_out_stride, - double* deriv_sum_out, - const int deriv_sum_out_stride, - float* self_repair_sum_out, - const int self_repair_sum_out_stride); - - -void cudaD_copy_cols_from_vec(dim3 Gr, dim3 Bl, double *mat_out, - MatrixDim d_out, const double *v_in); -void cudaF_copy_cols_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, - const float *v_in); +void cudaF_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const float* mat_in, + const MatrixElement* smat_in, + MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, + float* trace_vec_out); +void cudaD_vec_apply_ceiling(int Gr, int Bl, double* v, double ceiling_val, + float* num, int dim); +void cudaF_vec_apply_ceiling(int Gr, int Bl, float* v, float ceiling_val, + float* num, int dim); +void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim); +void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim); +void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, + float* num, int dim); +void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, + float* num, int dim); +void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim); +void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim); +void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, + const double *src, int dim); +void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, + const float *src, int dim); +void cudaD_vec_max(int Gr, int Bl, const double* v, double* value, int dim, + int inc); +void cudaF_vec_max(int Gr, int Bl, const float* v, float* value, int dim, + int inc); +void cudaD_vec_min(int Gr, int Bl, const double* v, double* value, int dim, + int inc); +void cudaF_vec_min(int Gr, int Bl, const float* v, float* value, int dim, + int inc); +void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, + int dim); +void cudaF_vec_mul_elements(int 
Gr, int Bl, float* v, const float* a, int dim); +void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim); +void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); +void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); +void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); } // extern "C" diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index a8d305c5bf4..d2a79f471c8 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -626,6 +626,7 @@ inline void cuda_diff_log_softmax(dim3 Gr, dim3 Bl, out_deriv, out_deriv_stride, in_deriv); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const double* input, const int input_stride, const double* params, @@ -645,7 +646,8 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, double* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows, + input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, @@ -656,6 +658,7 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, self_repair_sum_out_stride); } inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, + const int have_dropout_mask, const int num_rows, const float* input, const int input_stride, const float* params, @@ -675,7 +678,8 @@ inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, const int deriv_sum_out_stride, float* self_repair_sum_out, const int self_repair_sum_out_stride) { - cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, num_rows, input, input_stride, + cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, + num_rows, input, input_stride, params, params_stride, output_deriv, output_deriv_stride, deriv_sum_in, deriv_sum_in_stride, self_repair_config, count, @@ -849,17 +853,21 @@ inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, const int in_stride, const double* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, double* out) { cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, const int in_stride, const float* params, const int params_stride, const int out_stride, const int cell_dim, + const int have_dropout_mask, const int num_rows, float* out) { cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, num_rows, out); + out_stride, cell_dim, have_dropout_mask, + num_rows, out); } inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, @@ -1300,90 +1308,19 @@ inline void cuda_vec_min(int Gr, int Bl, const float* v, float* value, int dim, int inc) { cudaF_vec_min(Gr, Bl, v, value, dim, inc); } - -inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const double* in, - const int in_stride, const double* params, - const int params_stride, - const int out_stride, const int cell_dim, - const int have_dropout_mask, - const int num_rows, double* out) { - 
cudaD_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, have_dropout_mask, - num_rows, out); +inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, + int dim) { + cudaD_vec_mul_elements(Gr, Bl, v, a, dim); } -inline void cuda_lstm_nonlinearity(dim3 Gr, dim3 Bl, const float* in, - const int in_stride, const float* params, - const int params_stride, - const int out_stride, const int cell_dim, - const int have_dropout_mask, - const int num_rows, float* out) { - cudaF_lstm_nonlinearity(Gr, Bl, in, in_stride, params, params_stride, - out_stride, cell_dim, have_dropout_mask, - num_rows, out); +inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, + int dim) { + cudaF_vec_mul_elements(Gr, Bl, v, a, dim); } -inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, - const int have_dropout_mask, - const int num_rows, const double* input, - const int input_stride, - const double* params, - const int params_stride, - const double* output_deriv, - const int output_deriv_stride, - const double* deriv_sum_in, - const int deriv_sum_in_stride, - const double* self_repair_config, - double count, double* input_deriv, - const int input_deriv_stride, - double* params_deriv, - const int params_deriv_stride, - double* value_sum_out, - const int value_sum_out_stride, - double* deriv_sum_out, - const int deriv_sum_out_stride, - double* self_repair_sum_out, - const int self_repair_sum_out_stride) { - cudaD_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, num_rows, - input, input_stride, - params, params_stride, output_deriv, - output_deriv_stride, deriv_sum_in, - deriv_sum_in_stride, self_repair_config, count, - input_deriv, input_deriv_stride, params_deriv, - params_deriv_stride, value_sum_out, - value_sum_out_stride, deriv_sum_out, - deriv_sum_out_stride, self_repair_sum_out, - self_repair_sum_out_stride); +inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { + cudaD_vec_soft_max(Gr, Bl, v, dim); } -inline void cuda_diff_lstm_nonlinearity(dim3 Gr, dim3 Bl, const int cell_dim, - const int have_dropout_mask, - const int num_rows, const float* input, - const int input_stride, - const float* params, - const int params_stride, - const float* output_deriv, - const int output_deriv_stride, - const double* deriv_sum_in, - const int deriv_sum_in_stride, - const float* self_repair_config, - double count, float* input_deriv, - const int input_deriv_stride, - float* params_deriv, - const int params_deriv_stride, - double* value_sum_out, - const int value_sum_out_stride, - double* deriv_sum_out, - const int deriv_sum_out_stride, - float* self_repair_sum_out, - const int self_repair_sum_out_stride) { - cudaF_diff_lstm_nonlinearity(Gr, Bl, cell_dim, have_dropout_mask, - num_rows, input, input_stride, - params, params_stride, output_deriv, - output_deriv_stride, deriv_sum_in, - deriv_sum_in_stride, self_repair_config, count, - input_deriv, input_deriv_stride, params_deriv, - params_deriv_stride, value_sum_out, - value_sum_out_stride, deriv_sum_out, - deriv_sum_out_stride, self_repair_sum_out, - self_repair_sum_out_stride); +inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { + cudaF_vec_soft_max(Gr, Bl, v, dim); } inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc) { From 05fc6d250408048e7b6ecf63ed033c9e9b1de9a1 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 14:44:53 +0800 Subject: [PATCH 11/21] small bug fix --- 
src/nnet3/nnet-general-component.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 761ffbd6815..85743490518 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1424,7 +1424,6 @@ void DropoutMaskComponent::Propagate( out->CopyColFromVec(temp, 1); out->ApplyHeaviside(); } -} void DropoutMaskComponent::Read(std::istream &is, bool binary) { From 90df5d7c81bb3af14edbadce1e5a7920f992681b Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Sun, 9 Apr 2017 15:02:45 +0800 Subject: [PATCH 12/21] small fix --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9d95e41ab12..c92afb1c2dc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -717,9 +717,7 @@ def set_default_configs(self): 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, 'dropout-proportion' : -1.0, # If -1.0, no dropout will - # be used (note: this is - # per-frame dropout on the - # output of the i_t and f_t gates) + # be used) } def set_derived_configs(self): From 1a5823672105c9c2790ea39fb9b50c84e3b61ed9 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 11 Apr 2017 09:28:15 +0800 Subject: [PATCH 13/21] update scripts for tdnn-(fast)lstm of AMI-IHM --- egs/ami/s5b/RESULTS_ihm | 14 + .../local/chain/tuning/run_tdnn_lstm_1l.sh | 293 +++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1m.sh | 299 ++++++++++++++++++ 3 files changed, 606 insertions(+) create mode 100644 egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh create mode 100644 egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 25a60d24cfb..660fac9c200 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -84,6 +84,20 @@ %WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys +# local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model + dropout +%WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys + +# local/chain/tuning/run_tdnn_lstm_1j.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+fast-LSTM model +%WER 20.8 | 13098 94485 | 82.1 10.3 7.6 3.0 20.8 53.0 | -0.140 | exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys +%WER 20.3 | 12643 89982 | 82.3 11.4 6.3 2.6 20.3 51.1 | -0.035 | exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys + +# local/chain/tuning/run_tdnn_lstm_1m.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+fast-LSTM model + dropout +%WER 19.9 | 13098 94476 | 83.0 9.7 7.3 2.9 19.9 51.7 | -0.059 | exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 19.3 | 
12643 89969 | 83.1 10.8 6.1 2.4 19.3 49.9 | 0.045 | exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys # local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic ihm # cleanup + chain TDNN+LSTM model + IHM reverberated data diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh new file mode 100644 index 00000000000..02680b92f30 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# same as 1i but with per-frame dropout on LSTM layer +#IHM +#System tdnn_lstm1i_sp_bi_ld5 tdnn_lstm1l_sp_bi_ld5 +#WER on dev 20.6 19.8 +#WER on eval 20.1 19.2 +#Final train prob -0.045 -0.067 +#Final valid prob -0.098 -0.098 +#Final train prob (xent) -0.723 -0.916 +#Final valid prob (xent) -1.04 -1.10 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +dropout_schedule='0,0@0.20,0.3@0.50,0' + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1l #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh new file mode 100644 index 00000000000..395a6dff483 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -0,0 +1,299 @@ +#!/bin/bash + +# 1m is same as 1j but with the by-frame dropout fast-lstmp + +#IHM +#System tdnn_lstm1j_sp_bi_ld5 tdnn_lstm1m_sp_bi_ld5 +#WER on dev 20.8 19.9 +#WER on eval 20.3 19.3 +#Final train prob -0.044 -0.065 +#Final valid prob -0.107 -0.100 +#Final train prob (xent) -0.684 -0.885 +#Final valid prob (xent) -1.05 -1.09 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +dropout_schedule='0,0@0.20,0.3@0.50,0' + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1m #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + 
output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
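+  # (As far as I understand, mkgraph.sh takes the topology from the model in
+  # $dir, i.e. $dir/final.mdl, which is why the lang directory used here does
+  # not need to be the 'chain' one.)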
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 From 69a36e46f9b2312f46a6e1cb7cb998e2fceca5a4 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 11 Apr 2017 09:53:56 +0800 Subject: [PATCH 14/21] change scripts comment style and RESULTS --- egs/ami/s5b/RESULTS_ihm | 10 ---------- egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh | 13 ++++++++----- egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh | 13 +++++++++---- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 660fac9c200..6438f64a6c9 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -84,16 +84,6 @@ %WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys -# local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned -# cleanup + chain TDNN+LSTM model + dropout -%WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys -%WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys - -# local/chain/tuning/run_tdnn_lstm_1j.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned -# cleanup + chain TDNN+fast-LSTM model -%WER 20.8 | 13098 94485 | 82.1 10.3 7.6 3.0 20.8 53.0 | -0.140 | exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys -%WER 20.3 | 12643 89982 | 82.3 11.4 6.3 2.6 20.3 51.1 | -0.035 | exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys - # local/chain/tuning/run_tdnn_lstm_1m.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned # cleanup + chain TDNN+fast-LSTM model + dropout %WER 19.9 | 13098 94476 | 83.0 9.7 7.3 2.9 19.9 51.7 | -0.059 | exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index 02680b92f30..50d8d5ad0b9 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -5,11 +5,14 @@ #System tdnn_lstm1i_sp_bi_ld5 tdnn_lstm1l_sp_bi_ld5 #WER on dev 20.6 19.8 #WER on eval 20.1 19.2 -#Final train prob -0.045 -0.067 -#Final valid prob -0.098 -0.098 -#Final train prob (xent) -0.723 -0.916 -#Final valid prob 
(xent) -1.04 -1.10 - +#Final train prob -0.044763 -0.0666221 +#Final valid prob -0.0981107 -0.097616 +#Final train prob (xent) -0.722765 -0.915559 +#Final valid prob (xent) -1.03985 -1.09907 + +# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.064->-0.059 xent:train/valid[58,88,final]=(-0.940,-0.739,-0.723/-1.14,-1.04,-1.04) logprob:train/valid[58,88,final]=(-0.067,-0.046,-0.045/-0.103,-0.099,-0.098) +# exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.094->-0.082 xent:train/valid[58,88,final]=(-3.10,-1.11,-0.916/-3.17,-1.29,-1.10) logprob:train/valid[58,88,final]=(-0.164,-0.073,-0.067/-0.182,-0.104,-0.098) set -e -o pipefail diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh index 395a6dff483..f2244fdc1c8 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -6,10 +6,15 @@ #System tdnn_lstm1j_sp_bi_ld5 tdnn_lstm1m_sp_bi_ld5 #WER on dev 20.8 19.9 #WER on eval 20.3 19.3 -#Final train prob -0.044 -0.065 -#Final valid prob -0.107 -0.100 -#Final train prob (xent) -0.684 -0.885 -#Final valid prob (xent) -1.05 -1.09 +#Final train prob -0.0439145 -0.0653269 +#Final valid prob -0.10673 -0.0998743 +#Final train prob (xent) -0.683776 -0.884698 +#Final valid prob (xent) -1.05254 -1.09002 + +# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.063->-0.058 xent:train/valid[58,88,final]=(-0.888,-0.695,-0.684/-1.12,-1.06,-1.05) logprob:train/valid[58,88,final]=(-0.065,-0.045,-0.044/-0.105,-0.107,-0.107) +# exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.092->-0.080 xent:train/valid[58,88,final]=(-3.12,-1.09,-0.885/-3.20,-1.27,-1.09) logprob:train/valid[58,88,final]=(-0.164,-0.072,-0.065/-0.181,-0.103,-0.100) + set -e -o pipefail From d03be0ff2091f6d614d0cf2c5c08130f52c9de8a Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Wed, 12 Apr 2017 09:46:18 +0800 Subject: [PATCH 15/21] adding SDM results --- egs/ami/s5b/RESULTS_ihm | 8 +-- egs/ami/s5b/RESULTS_sdm | 4 ++ .../local/chain/tuning/run_tdnn_lstm_1i.sh | 3 +- .../local/chain/tuning/run_tdnn_lstm_1j.sh | 3 +- .../local/chain/tuning/run_tdnn_lstm_1l.sh | 54 +++++++++++++++++- .../local/chain/tuning/run_tdnn_lstm_1m.sh | 56 +++++++++++++++++-- 6 files changed, 115 insertions(+), 13 deletions(-) diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 6438f64a6c9..bdd5a18b235 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -84,10 +84,10 @@ %WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys -# local/chain/tuning/run_tdnn_lstm_1m.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned -# cleanup + chain TDNN+fast-LSTM model + dropout -%WER 19.9 | 13098 94476 | 83.0 9.7 7.3 2.9 19.9 51.7 | -0.059 | 
exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys -%WER 19.3 | 12643 89969 | 83.1 10.8 6.1 2.4 19.3 49.9 | 0.045 | exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys +# local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model + per-frame dropout +%WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys # local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic ihm # cleanup + chain TDNN+LSTM model + IHM reverberated data diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index 05b68e5e780..9ed296f51b1 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -91,6 +91,10 @@ %WER 37.6 | 15122 94495 | 66.1 18.7 15.1 3.7 37.6 63.2 | 0.646 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys %WER 40.9 | 13807 89961 | 62.4 20.0 17.6 3.3 40.9 65.7 | 0.612 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys +# local/chain/tuning/run_tdnn_lstm_1l.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model, SDM audio + alignments from ihm data + per-frame dropout. +%WER 35.9 | 14900 94497 | 67.8 18.2 14.1 3.7 35.9 62.5 | 0.647 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 39.4 | 13223 89946 | 64.1 19.7 16.2 3.5 39.4 67.0 | 0.611 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys # local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned # cleanup + chain TDNN+LSTM model, SDM original + IHM reverberated data, alignments from ihm data. diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh index 3e3976ac7a8..92636b4c17e 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -26,6 +26,7 @@ gmm=tri3_cleaned # the gmm for the target data ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=4 chunk_width=150 chunk_left_context=40 @@ -242,7 +243,7 @@ if [ $stage -le 16 ]; then --egs.chunk-right-context $chunk_right_context \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh index 008060df070..a96230075b6 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -34,6 +34,7 @@ gmm=tri3_cleaned # the gmm for the target data ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +num_epochs=4 chunk_width=150 chunk_left_context=40 @@ -254,7 +255,7 @@ if [ $stage -le 16 ]; then --egs.chunk-right-context-final 0 \ --trainer.num-chunk-per-minibatch 64,32 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index 50d8d5ad0b9..eac59626a0f 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -1,7 +1,12 @@ #!/bin/bash -# same as 1i but with per-frame dropout on LSTM layer -#IHM +# This (1l.sh) is the same as 1j but with per-frame dropout on LSTM layer +# It is a regular (non-fast) LSTM with per-frame dropout on [i, f, o] gates of the LSTM, +# the dropout-adding place is "place4" in paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf. +# We have tried both 4-epoch and 5-epoch training. + +### IHM +# Results with flags : --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ #System tdnn_lstm1i_sp_bi_ld5 tdnn_lstm1l_sp_bi_ld5 #WER on dev 20.6 19.8 #WER on eval 20.1 19.2 @@ -10,10 +15,53 @@ #Final train prob (xent) -0.722765 -0.915559 #Final valid prob (xent) -1.03985 -1.09907 -# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/ +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/ # exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.064->-0.059 xent:train/valid[58,88,final]=(-0.940,-0.739,-0.723/-1.14,-1.04,-1.04) logprob:train/valid[58,88,final]=(-0.067,-0.046,-0.045/-0.103,-0.099,-0.098) # exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.094->-0.082 xent:train/valid[58,88,final]=(-3.10,-1.11,-0.916/-3.17,-1.29,-1.10) logprob:train/valid[58,88,final]=(-0.164,-0.073,-0.067/-0.182,-0.104,-0.098) +# Results with flags for (1l.sh) : --num-epochs 5 --tlstm-affix 1i_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ +# Results with flags for (1i.sh) : --num-epochs 5 --tlstm-affix 1l_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1i_5epoch_sp_bi_ld5 tdnn_lstm1l_5epoch_sp_bi_ld5 +#WER on dev 20.8 19.7 +#WER on eval 20.6 19.3 +#Final train prob -0.0347795-0.0600903 +#Final valid prob -0.102486-0.0964607 +#Final train prob (xent) -0.621007 -0.84667 +#Final valid prob (xent) -1.02634 -1.04725 + +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.053->-0.049 xent:train/valid[73,110,final]=(-0.832,-0.631,-0.621/-1.09,-1.03,-1.03) logprob:train/valid[73,110,final]=(-0.057,-0.037,-0.035/-0.102,-0.103,-0.102) +# exp/ihm/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.085->-0.074 xent:train/valid[73,110,final]=(-3.14,-1.02,-0.847/-3.20,-1.21,-1.05) logprob:train/valid[73,110,final]=(-0.162,-0.065,-0.060/-0.177,-0.101,-0.096) + +### SDM +# Results with flags : --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm 
tri3_cleaned \ +#System tdnn_lstm1i_sp_bi_ihmali_ld5 tdnn_lstm1l_sp_bi_ihmali_ld5 +#WER on dev 37.0 35.9 +#WER on eval 40.0 39.4 +#Final train prob -0.106971 -0.15439 +#Final valid prob -0.252201 -0.244499 +#Final train prob (xent) -1.41142 -1.73795 +#Final valid prob (xent) -2.13741 -2.14519 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.138->-0.128 xent:train/valid[57,86,final]=(-1.78,-1.42,-1.41/-2.23,-2.14,-2.14) logprob:train/valid[57,86,final]=(-0.155,-0.108,-0.107/-0.251,-0.254,-0.252) +# exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.192->-0.174 xent:train/valid[57,86,final]=(-3.74,-1.95,-1.74/-3.86,-2.31,-2.15) logprob:train/valid[57,86,final]=(-0.287,-0.165,-0.154/-0.335,-0.250,-0.244) + +# Results with flags for (1i.sh) : --num-epochs 5 --tlstm-affix 1i_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +# Results with flags for (1l.sh) : --num-epochs 5 --tlstm-affix 1l_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5 tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5 +#WER on dev 36.9 35.8 +#WER on eval 40.2 39.5 +#Final train prob -0.0854552 -0.134189 +#Final valid prob -0.262789 -0.244183 +#inal train prob (xent) -1.2195 -1.58789 +#Final valid prob (xent) -2.13389 -2.08964 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5 exp/sdm1/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1i_5epoch_sp_bi_ihmali_ld5: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.111->-0.104 xent:train/valid[71,108,final]=(-1.61,-1.25,-1.22/-2.16,-2.15,-2.13) logprob:train/valid[71,108,final]=(-0.133,-0.089,-0.085/-0.246,-0.264,-0.263) +# exp/sdm1/chain_cleaned/tdnn_lstm1l_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.170->-0.153 xent:train/valid[71,108,final]=(-3.67,-1.76,-1.59/-3.81,-2.22,-2.09) logprob:train/valid[71,108,final]=(-0.274,-0.144,-0.134/-0.327,-0.248,-0.244) + + set -e -o pipefail # First the options that are passed through to run_ivector_common.sh diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh index f2244fdc1c8..b0e7af0618d 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -1,8 +1,12 @@ #!/bin/bash -# 1m is same as 1j but with the by-frame dropout fast-lstmp +# This (1m.sh) is the same as 1j but with per-frame dropout on LSTM layer +# It is a fast LSTM with per-frame dropout on [i, f, o] gates of the LSTM, +# the dropout-adding place is "place4" in paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf. +# We have tried both 4-epoch and 5-epoch training. 
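+# (Here "per-frame" dropout means that, roughly speaking, a single dropout-mask
+# value is drawn per frame and per gate and shared across the whole cell
+# dimension, rather than a separate mask per dimension; the proportion follows
+# $dropout_schedule, e.g. '0,0@0.20,0.3@0.50,0' keeps it at 0 for the first 20%
+# of training, ramps it linearly up to 0.3 at the half-way point, and brings it
+# back to 0 by the end of training.)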
-#IHM +### IHM +# Results with flags : --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ #System tdnn_lstm1j_sp_bi_ld5 tdnn_lstm1m_sp_bi_ld5 #WER on dev 20.8 19.9 #WER on eval 20.3 19.3 @@ -15,6 +19,48 @@ # exp/ihm/chain_cleaned/tdnn_lstm1j_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.063->-0.058 xent:train/valid[58,88,final]=(-0.888,-0.695,-0.684/-1.12,-1.06,-1.05) logprob:train/valid[58,88,final]=(-0.065,-0.045,-0.044/-0.105,-0.107,-0.107) # exp/ihm/chain_cleaned/tdnn_lstm1m_sp_bi_ld5: num-iters=89 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.092->-0.080 xent:train/valid[58,88,final]=(-3.12,-1.09,-0.885/-3.20,-1.27,-1.09) logprob:train/valid[58,88,final]=(-0.164,-0.072,-0.065/-0.181,-0.103,-0.100) +# Results with flags for (1m.sh) : --num-epochs 5 --tlstm-affix 1m_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ +# Results with flags for (1j.sh) : --num-epochs 5 --tlstm-affix 1j_5epoch --mic ihm --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1j_5epoch_sp_bi_ld5 tdnn_lstm1m_5epoch_sp_bi_ld5 +#WER on dev 21.1 19.9 +#WER on eval 20.9 19.8 +#Final train prob -0.0365079 -0.057024 +#Final valid prob -0.112709-0.0992725 +#inal train prob (xent) -0.601602 -0.800653 +#Final valid prob (xent) -1.03241 -1.04748 + +# ./steps/info/chain_dir_info.pl exp/ihm/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ld5/ exp/ihm/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ld5/ +# exp/ihm/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.053->-0.049 xent:train/valid[73,110,final]=(-0.813,-0.615,-0.602/-1.08,-1.04,-1.03) logprob:train/valid[73,110,final]=(-0.057,-0.038,-0.037/-0.106,-0.113,-0.113) +# exp/ihm/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ld5/: num-iters=111 nj=2..12 num-params=43.4M dim=40+100->3765 combine=-0.080->-0.072 xent:train/valid[73,110,final]=(-3.15,-0.985,-0.801/-3.26,-1.21,-1.05) logprob:train/valid[73,110,final]=(-0.161,-0.062,-0.057/-0.183,-0.102,-0.099) + +#### SDM +# Results with flags : --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1j_sp_bi_ihmali_ld5 tdnn_lstm1m_sp_bi_ihmali_ld5 +#WER on dev 36.9 36.4 +#WER on eval 40.5 39.9 +#Final train prob -0.108141 -0.148861 +#Final valid prob -0.257468 -0.240962 +#Final train prob (xent) -1.38179 -1.70258 +#Final valid prob (xent) -2.13095 -2.12803 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1m_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.138->-0.128 xent:train/valid[57,86,final]=(-1.71,-1.39,-1.38/-2.18,-2.14,-2.13) logprob:train/valid[57,86,final]=(-0.150,-0.110,-0.108/-0.251,-0.260,-0.257) +# exp/sdm1/chain_cleaned/tdnn_lstm1m_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.187->-0.170 xent:train/valid[57,86,final]=(-3.74,-1.90,-1.70/-3.88,-2.28,-2.13) logprob:train/valid[57,86,final]=(-0.286,-0.158,-0.149/-0.336,-0.245,-0.241) + +# Results with flags for (1m.sh) : --num-epochs 5 --tlstm-affix 1m_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +# Results with flags for (1j.sh) : --num-epochs 5 --tlstm-affix 1j_5epoch --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned\ +#System tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5 tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5 +#WER on dev 37.4 36.0 +#WER on eval 40.7 39.6 +#Final 
train prob -0.0879063 -0.133092 +#Final valid prob -0.270953 -0.243246 +#Final train prob (xent) -1.20822 -1.56293 +#Final valid prob (xent) -2.1425 -2.07265 + +# ./steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1j_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.115->-0.107 xent:train/valid[71,108,final]=(-1.56,-1.22,-1.21/-2.16,-2.16,-2.14) logprob:train/valid[71,108,final]=(-0.131,-0.090,-0.088/-0.256,-0.273,-0.271) +# exp/sdm1/chain_cleaned/tdnn_lstm1m_5epoch_sp_bi_ihmali_ld5/: num-iters=109 nj=2..12 num-params=43.4M dim=40+100->3741 combine=-0.167->-0.153 xent:train/valid[71,108,final]=(-3.69,-1.71,-1.56/-3.84,-2.20,-2.07) logprob:train/valid[71,108,final]=(-0.279,-0.140,-0.133/-0.329,-0.247,-0.243) + set -e -o pipefail @@ -30,7 +76,9 @@ gmm=tri3_cleaned # the gmm for the target data ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned -dropout_schedule='0,0@0.20,0.3@0.50,0' +dropout_schedule='0,0@0.20,0.3@0.50,0' # dropout schedule controls the dropout + # proportion for each training iteration. +num_epochs=4 chunk_width=150 chunk_left_context=40 @@ -252,7 +300,7 @@ if [ $stage -le 16 ]; then --trainer.dropout-schedule $dropout_schedule \ --trainer.num-chunk-per-minibatch 64,32 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ + --trainer.num-epochs $num_epochs \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.num-jobs-initial 2 \ --trainer.optimization.num-jobs-final 12 \ From 936863ee456364bc90e5cd904a3a31adbc83cd56 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Mon, 17 Apr 2017 09:31:41 +0800 Subject: [PATCH 16/21] adding SWBD (parts of all) scripts with dropout --- .../s5c/local/chain/tuning/run_blstm_6l.sh | 247 ++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1k.sh | 321 ++++++++++++++++++ 2 files changed, 568 insertions(+) create mode 100644 egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh create mode 100644 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh new file mode 100644 index 00000000000..e577f96a58f --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh @@ -0,0 +1,247 @@ +#!/bin/bash + +# 6l is same as 6k, but with the per-frame dropout +# location4 as paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# local/chain/compare_wer_general.sh blstm_6k_sp blstm_6l_sp +# result (14.5 vs 14.1), this may due to noise + +# System blstm_6k_sp blstm_6l_sp +# WER on train_dev(tg) 13.30 13.06 +# WER on train_dev(fg) 12.34 12.16 +# WER on eval2000(tg) 15.5 15.2 +# WER on eval2000(fg) 14.1 13.8 +# Final train prob -0.052 -0.065 +# Final valid prob -0.090 -0.093 +# Final train prob (xent) -0.743 -0.831 +# Final valid prob (xent) -0.9579 -0.9821 + +# exp/chain/blstm_6k_sp/: num-iters=327 nj=3..16 num-params=41.2M dim=40+100->6074 combine=-0.069->-0.069 xent:train/valid[217,326,final]=(-0.849,-0.748,-0.743/-1.04,-0.959,-0.958) logprob:train/valid[217,326,final]=(-0.065,-0.053,-0.052/-0.096,-0.090,-0.090) +# exp/chain/blstm_6l_sp/: num-iters=327 nj=3..16 num-params=41.2M dim=40+100->6074 combine=-0.084->-0.082 xent:train/valid[217,326,final]=(-1.45,-0.840,-0.831/-1.58,-0.994,-0.982) 
logprob:train/valid[217,326,final]=(-0.110,-0.066,-0.065/-0.132,-0.094,-0.093) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6l # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 +dropout_schedule='0,0@0.20,0.1@0.50,0' + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh new file mode 100644 index 00000000000..21cb4fa9373 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# run_tdnn_lstm_1k.sh is like run_tdnn_lstm_1e.sh but +# added the per-frame dropout location 4 as paper: +# http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1k_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1k_sp +# WER on train_dev(tg) 13.18 12.60 +# [looped:] 13.10 12.56 +# WER on train_dev(fg) 12.21 11.58 +# [looped:] 12.28 11.62 +# WER on eval2000(tg) 15.8 15.2 +# [looped:] 15.8 15.2 +# WER on eval2000(fg) 14.5 13.7 +# [looped:] 14.5 13.8 +# Final train prob -0.060 -0.076 +# Final valid prob -0.101 -0.106 +# Final train prob (xent) -0.868 -0.989 +# Final valid prob (xent) -1.0740 -1.1341 + +# exp/chain/tdnn_lstm_1e_sp/: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.072->-0.071 xent:train/valid[173,261,final]=(-1.01,-0.876,-0.868/-1.16,-1.08,-1.07) logprob:train/valid[173,261,final]=(-0.075,-0.061,-0.060/-0.106,-0.101,-0.101) +# exp/chain/tdnn_lstm_1k_sp/: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.093->-0.089 xent:train/valid[173,261,final]=(-2.87,-1.07,-0.989/-2.90,-1.20,-1.13) logprob:train/valid[173,261,final]=(-0.153,-0.079,-0.076/-0.179,-0.107,-0.106) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1e # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_nj=50 + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
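+  # (The '7000' below is the target number of tree leaves, and
+  # --frame-subsampling-factor 3 matches the reduced output frame rate
+  # used by 'chain' models.)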
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
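+  # (Looped decoding carries the recurrent state over from chunk to chunk
+  # instead of re-initializing it from extra left-context, which is why it
+  # only makes a difference for forward-recurrent models like the LSTMs here.)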
+ rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; From f51fb75dd76dca6e1ca6cdbf0a18c78967fcdc64 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Mon, 17 Apr 2017 09:31:56 +0800 Subject: [PATCH 17/21] small fix --- egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh index e577f96a58f..68daf81ab01 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6l.sh @@ -4,6 +4,7 @@ # location4 as paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf # local/chain/compare_wer_general.sh blstm_6k_sp blstm_6l_sp +# attention: the blatm_6k_sp result here is far better than the updated # result (14.5 vs 14.1), this may due to noise # System blstm_6k_sp blstm_6l_sp From 139f412fcb7b0df71065458e22a16558a0184529 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 18 Apr 2017 11:19:39 +0800 Subject: [PATCH 18/21] update tdnn-blstm with dropout in SWBD --- .../local/chain/tuning/run_tdnn_blstm_1b.sh | 248 ++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh new file mode 100644 index 00000000000..3929cdc432e --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1b.sh @@ -0,0 +1,248 @@ +#!/bin/bash + +# tdnn_blstm_1b is same as tdnn_blstm_1a, but with the per-frame dropout +# added with location 4, see paper: +# 
http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh tdnn_blstm_1a_sp tdnn_blstm_1b_sp +# System tdnn_blstm_1a_sp tdnn_blstm_1b_sp +# WER on train_dev(tg) 12.86 12.60 +# WER on train_dev(fg) 11.86 11.80 +# WER on eval2000(tg) 15.3 14.9 +# WER on eval2000(fg) 14.0 13.5 +# Final train prob -0.042 -0.054 +# Final valid prob -0.099 -0.091 +# Final train prob (xent) -0.637 -0.719 +# Final valid prob (xent) -0.9418 -0.9190 + +# exp/chain/tdnn_blstm_1a_sp/: num-iters=327 nj=3..16 num-params=53.7M dim=40+100->6074 combine=-0.058->-0.057 xent:train/valid[217,326,final]=(-0.753,-0.631,-0.637/-0.974,-0.941,-0.942) logprob:train/valid[217,326,final]=(-0.055,-0.041,-0.042/-0.094,-0.099,-0.099) +# exp/chain/tdnn_blstm_1b_sp/: num-iters=327 nj=3..16 num-params=53.7M dim=40+100->6074 combine=-0.070->-0.068 xent:train/valid[217,326,final]=(-1.27,-0.732,-0.719/-1.42,-0.931,-0.919) logprob:train/valid[217,326,final]=(-0.094,-0.055,-0.054/-0.117,-0.091,-0.091) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_blstm_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 +dropout_schedule='0,0@0.20,0.1@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
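+  # (--leftmost-questions-truncate -1, as set above, means the questions are
+  # not truncated.)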
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=blstm1-forward input=tdnn3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm1-backward input=tdnn3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
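+  # (For example, with xent_regularize=0.025 as set above, this factor is
+  # 0.5/0.025 = 20.)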
+ output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; From 9a8b81cb839b2ac8f99fe4bd6fbc17a6e8bc1eff Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 18 Apr 2017 12:07:55 +0800 Subject: [PATCH 19/21] update tdnn+regular-LSTM(4epoch) in SWBD 5epoch is on the way --- .../local/chain/tuning/run_tdnn_lstm_1l.sh | 244 ++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh new file mode 100644 index 00000000000..e88e199839c --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# tdnn_lstm_1l is same as tdnn_lstm_1b, but with the per-frame dropout +# added with location 4 in LSTM layer, see paper: +# http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh tdnn_lstm_1b_ld5_sp tdnn_lstm_1l_ld5_sp +# System tdnn_lstm_1b_ld5_sp tdnn_lstm_1l_ld5_sp +# WER on train_dev(tg) 13.06 12.41 +# WER on train_dev(fg) 12.13 11.59 +# WER on eval2000(tg) 15.1 14.8 +# WER on eval2000(fg) 13.9 13.5 +# Final train prob -0.047 -0.069 +# Final valid prob -0.093 -0.095 +# Final train prob (xent) -0.735 -0.913 +# Final valid prob (xent) -1.0151 -1.0820 + +# exp/chain/tdnn_lstm_1b_ld5_sp: num-iters=327 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.062->-0.061 xent:train/valid[217,326,final]=(-0.877,-0.741,-0.735/-1.08,-1.02,-1.02) logprob:train/valid[217,326,final]=(-0.063,-0.048,-0.047/-0.095,-0.093,-0.093) +# exp/chain/tdnn_lstm_1l_ld5_sp: num-iters=327 nj=3..16 num-params=39.6M dim=40+100->6074 combine=-0.088->-0.084 xent:train/valid[217,326,final]=(-3.32,-0.961,-0.913/-3.40,-1.13,-1.08) logprob:train/valid[217,326,final]=(-0.176,-0.072,-0.069/-0.198,-0.097,-0.095) +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1l # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +dropout_schedule='0,0@0.20,0.3@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
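+  # (for reference: 7000 below is the target number of tree leaves, and
+  # --frame-subsampling-factor 3 matches the reduced output frame rate of
+  # 'chain' models, which emit one output per 3 input frames.)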
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 dropout-per-frame=true + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; From 48f41a7a399a16c7e778605f693ade507d7eae6c Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Thu, 20 Apr 2017 09:32:07 +0800 Subject: [PATCH 20/21] adding tedlium scripts also SWBD RESULTS updated --- egs/swbd/s5c/RESULTS | 6 + .../local/chain/tuning/run_tdnn_lstm_1s.sh | 333 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1t.sh | 333 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1u.sh | 327 +++++++++++++++++ 4 files changed, 999 insertions(+) create mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh create mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh create mode 100644 egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index f103200f966..2cf34c600c1 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -203,6 +203,12 @@ exit 0 %WER 21.2 | 2628 21594 | 81.4 12.8 5.9 2.6 21.2 56.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 13.88 [ 6829 / 49204, 935 ins, 1690 del, 4204 sub ] exp/chain/lstm_d_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# current best 'chain' models with TDNN + LSTM + dropout (see local/chain/run_tdnn_lstm_1l.sh) +%WER 13.5 | 4459 42989 | 88.2 8.0 3.8 1.7 13.5 48.2 | exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 8.8 | 1831 21395 | 92.3 5.2 2.5 1.1 8.8 41.9 | exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 18.1 | 2628 21594 | 84.0 10.8 5.2 2.2 18.1 52.6 | exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 11.59 [ 5615 / 48460, 708 ins, 1450 del, 3457 sub ] exp/chain/tdnn_lstm_1b_dropout_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 + # these are results with nnet3 LSTMs with CTC training : local/ctc/run_lstm.sh %WER 17.4 | 1831 21395 | 85.3 10.1 4.6 2.7 17.4 57.8 | exp/ctc/lstm_sp/decode_eval2000_sw1_fsh_fg_0.15/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys %WER 19.4 | 1831 21395 | 83.5 11.2 5.2 3.0 19.4 60.7 | exp/ctc/lstm_sp/decode_eval2000_sw1_tg_0.15/score_12_0.5/eval2000_hires.ctm.swbd.filt.sys diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh new file mode 100644 index 00000000000..dc0f59fb64a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1s.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# 1s is as 1e, but adding per-frame dropout to LSTM in location4 +# as paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh 
--looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1s_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1s_sp_bi +# WER on dev(orig) 9.0 8.9 +# [looped:] 9.0 8.9 +# WER on dev(rescored) 8.4 8.1 +# [looped:] 8.4 8.1 +# WER on test(orig) 8.9 8.8 +# [looped:] 8.9 8.8 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.4 8.3 +# Final train prob -0.0712 -0.0914 +# Final valid prob -0.0892 -0.0977 +# Final train prob (xent) -0.8566 -0.9931 +# Final valid prob (xent) -0.9927 -1.0633 + +# exp/chain_cleaned/tdnn_lstm1e_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.082->-0.081 xent:train/valid[167,252,final]=(-0.961,-0.859,-0.857/-1.06,-0.999,-0.993) logprob:train/valid[167,252,final]=(-0.086,-0.072,-0.071/-0.098,-0.091,-0.089) +# exp/chain_cleaned/tdnn_lstm1s_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.104->-0.101 xent:train/valid[167,252,final]=(-3.08,-1.07,-0.993/-3.13,-1.14,-1.06) logprob:train/valid[167,252,final]=(-0.181,-0.093,-0.091/-0.183,-0.100,-0.098) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +dropout_schedule="0,0@0.2,0.3@0.5,0" +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1s #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 dropout-proportion=0.0 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh new file mode 100644 index 00000000000..c286fcef353 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1t.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# 1t is as 1e, but increasing the TDNN dim and LSTM cell-dim into +# 1024, the recurrent and non-recurrent projection of the LSTM from +# 128 into 256. 
+ +# ./local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1t_sp_bi +# System tdnn_lstm1e_again_sp_bi tdnn_lstm1t_again_sp_bi +# WER on dev(orig) 9.0 8.9 +# [looped:] 9.0 8.9 +# WER on dev(rescored) 8.4 8.2 +# [looped:] 8.4 8.3 +# WER on test(orig) 8.9 8.9 +# [looped:] 8.9 9.0 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.4 8.5 +# Final train prob -0.0712 -0.0459 +# Final valid prob -0.0892 -0.0867 +# Final train prob (xent) -0.8566 -0.6434 +# Final valid prob (xent) -0.9927 -0.8733 + +# exp/chain_cleaned/tdnn_lstm1e_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3626 combine=-0.082->-0.081 xent:train/valid[167,252,final]=(-0.961,-0.859,-0.857/-1.06,-0.999,-0.993) logprob:train/valid[167,252,final]=(-0.086,-0.072,-0.071/-0.098,-0.091,-0.089) +# exp/chain_cleaned/tdnn_lstm1t_sp_bi: num-iters=253 nj=2..12 num-params=37.1M dim=40+100->3626 combine=-0.055->-0.055 xent:train/valid[167,252,final]=(-0.774,-0.655,-0.643/-0.928,-0.883,-0.873) logprob:train/valid[167,252,final]=(-0.063,-0.048,-0.046/-0.087,-0.089,-0.087) + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1t #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh new file mode 100644 index 00000000000..9e50060f5d6 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1u.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +# 1u is the same as 1t but adding per-frame dropout to LSTM +# in location4, see paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf + +# ./local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1t_sp_bi exp/chain_cleaned/tdnn_lstm1u_sp_bi +# System tdnn_lstm1t_again_sp_bi tdnn_lstm1u_sp_bi +# WER on dev(orig) 8.9 8.6 +# WER on dev(rescored) 8.2 8.0 +# WER on test(orig) 8.9 8.3 +# WER on test(rescored) 8.4 7.9 +# Final train prob -0.0459 -0.0709 +# Final valid prob -0.0867 -0.0902 +# Final train prob (xent) -0.6434 -0.8112 +# Final valid prob (xent) -0.8733 -0.9384 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +dropout_schedule="0,0@0.20,0.3@0.5,0" +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1u #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 dim=1024 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=1024 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=1024 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 dropout-proportion=0.0 
decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
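+  # (the topology it does use comes from $dir/final.mdl, which mkgraph.sh
+  # reads together with the tree in $dir.)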
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 From 62fee2b5e0a579f5afa8f786e16262993eb782e3 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Thu, 20 Apr 2017 09:56:14 +0800 Subject: [PATCH 21/21] small fix --- egs/ami/s5b/RESULTS_ihm | 1 + egs/ami/s5b/RESULTS_sdm | 1 + egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index bdd5a18b235..a2b5d0c3a5c 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -85,6 +85,7 @@ %WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys # local/chain/tuning/run_tdnn_lstm_1l.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# same as local/chain/tuning/run_tdnn_lstm_1i.sh, except that dropout is adopted # cleanup + chain TDNN+LSTM model + per-frame dropout %WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys %WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index 9ed296f51b1..bbe0ba3aa12 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -92,6 +92,7 @@ %WER 40.9 | 13807 89961 | 62.4 20.0 17.6 3.3 40.9 65.7 | 0.612 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys # local/chain/tuning/run_tdnn_lstm_1l.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# same as local/chain/tuning/run_tdnn_lstm_1i.sh, except that dropout is adopted # cleanup + chain TDNN+LSTM model, SDM audio + alignments from ihm data + per-frame dropout. %WER 35.9 | 14900 94497 | 67.8 18.2 14.1 3.7 35.9 62.5 | 0.647 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys %WER 39.4 | 13223 89946 | 64.1 19.7 16.2 3.5 39.4 67.0 | 0.611 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index eac59626a0f..74c0f5a6ead 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This (1l.sh) is the same as 1j but with per-frame dropout on LSTM layer +# This (1l.sh) is the same as 1i but with per-frame dropout on LSTM layer # It is a regular (non-fast) LSTM with per-frame dropout on [i, f, o] gates of the LSTM, # the dropout-adding place is "place4" in paper : http://www.danielpovey.com/files/2017_interspeech_dropout.pdf. # We have tried both 4-epoch and 5-epoch training.
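As a concrete illustration of two mechanisms the recipes above depend on, the --trainer.dropout-schedule string and per-frame dropout on the LSTM gates, here is a minimal numpy sketch. It is not code from these patches: the function names and shapes are invented for illustration, the schedule parsing assumes the usual proportion[@fraction-of-data] format with linear interpolation between the listed points, and the question of rescaling surviving values by 1/(1-p) is ignored here.

import numpy as np

def dropout_proportion(schedule, progress):
    """Return the dropout proportion at 'progress' (fraction of training
    completed, in [0, 1]) for a schedule string like '0,0@0.20,0.3@0.50,0'.
    Each entry is proportion[@fraction]; the first entry is taken to be at
    fraction 0.0 and the last at 1.0, with linear interpolation between."""
    entries = []
    parts = schedule.split(',')
    for i, part in enumerate(parts):
        if '@' in part:
            prop, frac = part.split('@')
        else:
            prop, frac = part, ('0.0' if i == 0 else '1.0')
        entries.append((float(frac), float(prop)))
    entries.sort()
    fracs = [f for f, _ in entries]
    props = [p for _, p in entries]
    return float(np.interp(progress, fracs, props))

def per_frame_gate_dropout(i_t, f_t, o_t, proportion, rng):
    """Apply per-frame dropout to the i, f and o gate activations of a single
    frame: one Bernoulli keep/drop draw per gate, broadcast across the whole
    cell dimension (this is how 'per-frame' is read here)."""
    keep_prob = 1.0 - proportion
    masks = rng.binomial(1, keep_prob, size=3)  # one scalar mask per gate
    return i_t * masks[0], f_t * masks[1], o_t * masks[2]

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    sched = '0,0@0.20,0.3@0.50,0'
    # The proportion stays at 0 until 20% of training, rises to 0.3 at 50%,
    # then falls back to 0 by the end; e.g. at 35% it is 0.15.
    for progress in (0.1, 0.35, 0.5, 0.75, 1.0):
        print(progress, dropout_proportion(sched, progress))
    cell_dim = 4
    i_t, f_t, o_t = (rng.rand(cell_dim) for _ in range(3))
    print(per_frame_gate_dropout(i_t, f_t, o_t,
                                 dropout_proportion(sched, 0.5), rng))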