From 7f3d44e8e5c872733d700d82339019dd1615398e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 16:16:15 +0200 Subject: [PATCH 01/35] initial commit run.sh s5_r3 WIP --- egs/tedlium/s5_r3/run.sh | 222 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100755 egs/tedlium/s5_r3/run.sh diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh new file mode 100755 index 00000000000..7147476fe52 --- /dev/null +++ b/egs/tedlium/s5_r3/run.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# +# Based mostly on the Switchboard recipe. The training database is TED-LIUM, +# it consists of TED talks with cleaned automatic transcripts: +# +# http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus +# http://www.openslr.org/resources (Mirror). +# +# The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, +# which allow free non-commercial use, while only a citation is required. +# +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Vincent Nguyen +# 2016 Johns Hopkins University (Author: Daniel Povey) +# 2018 François Hernandez +# +# Apache 2.0 +# + +. ./cmd.sh +. ./path.sh + + +set -e -o pipefail -u + +nj=35 +decode_nj=30 # note: should not be >38 which is the number of speakers in the dev set + # after applying --seconds-per-spk-max 180. We decode with 4 threads, so + # this will be too many jobs if you're using run.pl. +stage=0 +train_rnnlm=true + +. utils/parse_options.sh # accept options + +# Data preparation +if [ $stage -le 0 ]; then + local/download_data.sh +fi + +if [ $stage -le 1 ]; then + local/prepare_data.sh + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + # [we chose 3 minutes because that gives us 38 speakers for the dev data, which is + # more than our normal 30 jobs.] 
+ for dset in dev test train; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}.orig data/${dset} + done +fi + + +if [ $stage -le 2 ]; then + local/prepare_dict.sh +fi + +if [ $stage -le 3 ]; then + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_nosp data/lang_nosp +fi + +if [ $stage -le 4 ]; then + # later on we'll change this script so you have the option to + # download the pre-built LMs from openslr.org instead of building them + # locally. + local/ted_train_lm.sh +fi + +if [ $stage -le 5 ]; then + local/format_lms.sh +fi + +# Feature extraction +if [ $stage -le 6 ]; then + for set in test dev train; do + dir=data/$set + steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" $dir + steps/compute_cmvn_stats.sh $dir + done +fi + +# Now we have 452 hours of training data. +# Well create a subset with 10k short segments to make flat-start training easier: +if [ $stage -le 7 ]; then + utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort + utils/data/remove_dup_utts.sh 10 data/train_10kshort data/train_10kshort_nodup +fi + +# Train +if [ $stage -le 8 ]; then + steps/train_mono.sh --nj 20 --cmd "$train_cmd" \ + data/train_10kshort_nodup data/lang_nosp exp/mono +fi + +if [ $stage -le 9 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp exp/mono exp/mono_ali + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang_nosp exp/mono_ali exp/tri1 +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh data/lang_nosp exp/tri1 exp/tri1/graph_nosp + + # The slowest part about this decoding is the scoring, which we can't really + # control as the bottleneck is the NIST tools. 
+ for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri1/graph_nosp data/${dset} exp/tri1/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \ + data/${dset} exp/tri1/decode_nosp_${dset} exp/tri1/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/train data/lang_nosp exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 12 ]; then + utils/mkgraph.sh data/lang_nosp exp/tri2 exp/tri2/graph_nosp + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph_nosp data/${dset} exp/tri2/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \ + data/${dset} exp/tri2/decode_nosp_${dset} exp/tri2/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 13 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp exp/tri2 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ + exp/tri2/sil_counts_nowb.txt \ + exp/tri2/pron_bigram_counts_nowb.txt data/local/dict +fi + +if [ $stage -le 14 ]; then + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + cp -rT data/lang data/lang_rescore + cp data/lang_nosp/G.fst data/lang/ + cp data/lang_nosp_rescore/G.carpa data/lang_rescore/ + + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph + + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri2/decode_${dset} exp/tri2/decode_${dset}_rescore + done +fi + +if [ $stage -le 15 ]; then + steps/align_si.sh --nj 
$nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2_ali exp/tri3 + + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph + + for dset in dev test; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri3/decode_${dset} exp/tri3/decode_${dset}_rescore + done +fi + + +if [ $stage -le 16 ]; then + # this does some data-cleaning. It actually degrades the GMM-level results + # slightly, but the cleaned data should be useful when we add the neural net and chain + # systems. If not we'll remove this stage. + local/run_cleanup_segmentation.sh +fi + + +if [ $stage -le 17 ]; then + # This will only work if you have GPUs on your system (and note that it requires + # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) + local/chain/tuning/run_tdnn_PR2114.sh +fi + + +if [ $stage -le 18 ]; then + # todo add option to choose between training and downloading + if $train_rnnlm; then + local/rnnlm/tuning/run_lstm_tdnn_a.sh + local/rnnlm/average_rnnlm.sh + fi +fi + + +if [ $stage -le 19 ]; then + # Here we rescore the lattices generated at stage 17 + rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged + lang_dir=data/lang_chain + ngram_order=4 + + for set in dev test; do + data_dir=data/${set}_hires + decoding_dir=exp/chain/ # TODO path to tdnn dev and test decoding dirs + suffix=$(basename $rnnlm_dir) + output_dir=${decoding_dir}_$suffix + + rnnlm/lmrescore_pruned.sh \ + --cmd "$decode_cmd --mem 4G" \ + --weight 0.5 --max-ngram-order $ngram_order \ + $lang_dir $rnnlm_dir \ + $data_dir $decoding_dir \ + $output_dir + done +fi + + +echo "$0: success." 
+exit 0 From cd00b07120d05aad3f1e27054a1013baa1670f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 16:19:57 +0200 Subject: [PATCH 02/35] add links and conf dir s5_r3 --- egs/tedlium/s5_r3/conf/decode.config | 1 + egs/tedlium/s5_r3/conf/decode_dnn.config | 2 ++ egs/tedlium/s5_r3/conf/fbank.conf | 5 +++++ egs/tedlium/s5_r3/conf/mfcc.conf | 2 ++ egs/tedlium/s5_r3/conf/mfcc_hires.conf | 10 ++++++++++ egs/tedlium/s5_r3/conf/no_k20.conf | 13 +++++++++++++ egs/tedlium/s5_r3/conf/online_cmvn.conf | 1 + egs/tedlium/s5_r3/conf/pitch.conf | 2 ++ egs/tedlium/s5_r3/rnnlm | 1 + egs/tedlium/s5_r3/steps | 1 + egs/tedlium/s5_r3/utils | 1 + 11 files changed, 39 insertions(+) create mode 100644 egs/tedlium/s5_r3/conf/decode.config create mode 100644 egs/tedlium/s5_r3/conf/decode_dnn.config create mode 100644 egs/tedlium/s5_r3/conf/fbank.conf create mode 100644 egs/tedlium/s5_r3/conf/mfcc.conf create mode 100644 egs/tedlium/s5_r3/conf/mfcc_hires.conf create mode 100644 egs/tedlium/s5_r3/conf/no_k20.conf create mode 100644 egs/tedlium/s5_r3/conf/online_cmvn.conf create mode 100644 egs/tedlium/s5_r3/conf/pitch.conf create mode 120000 egs/tedlium/s5_r3/rnnlm create mode 120000 egs/tedlium/s5_r3/steps create mode 120000 egs/tedlium/s5_r3/utils diff --git a/egs/tedlium/s5_r3/conf/decode.config b/egs/tedlium/s5_r3/conf/decode.config new file mode 100644 index 00000000000..7ba966f2b83 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/decode.config @@ -0,0 +1 @@ +# empty config, just use the defaults. diff --git a/egs/tedlium/s5_r3/conf/decode_dnn.config b/egs/tedlium/s5_r3/conf/decode_dnn.config new file mode 100644 index 00000000000..ab8dcc1dc08 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=13.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=8.0 # this has most effect on size of the lattices. 
diff --git a/egs/tedlium/s5_r3/conf/fbank.conf b/egs/tedlium/s5_r3/conf/fbank.conf new file mode 100644 index 00000000000..4c57f8a8765 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/fbank.conf @@ -0,0 +1,5 @@ +--window-type=hamming # disable Dans window, use the standard +--use-energy=false # only fbank outputs +--dither=1 +--num-mel-bins=40 # 8 filters/octave, 40 filters/16Khz as used by IBM +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/tedlium/s5_r3/conf/mfcc.conf b/egs/tedlium/s5_r3/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/tedlium/s5_r3/conf/mfcc_hires.conf b/egs/tedlium/s5_r3/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/tedlium/s5_r3/conf/no_k20.conf b/egs/tedlium/s5_r3/conf/no_k20.conf new file mode 100644 index 00000000000..f0cba4df971 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/no_k20.conf @@ -0,0 +1,13 @@ +# Default configuration +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q +option gpu=* -l gpu=$0 -q g.q +default allow_k20=true +option allow_k20=true +option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' diff --git a/egs/tedlium/s5_r3/conf/online_cmvn.conf b/egs/tedlium/s5_r3/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/tedlium/s5_r3/conf/pitch.conf b/egs/tedlium/s5_r3/conf/pitch.conf new file mode 100644 index 00000000000..bba51335be3 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/pitch.conf @@ -0,0 +1,2 @@ +--nccf-ballast-online=true # helps for online operation. 
+ diff --git a/egs/tedlium/s5_r3/rnnlm b/egs/tedlium/s5_r3/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/tedlium/s5_r3/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/tedlium/s5_r3/steps b/egs/tedlium/s5_r3/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/tedlium/s5_r3/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/tedlium/s5_r3/utils b/egs/tedlium/s5_r3/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/tedlium/s5_r3/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From 769809cfcaf7173463a648c073db52a624a6859c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 16:36:42 +0200 Subject: [PATCH 03/35] add tdnnf best result script TODO header --- .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 252 ++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh new file mode 100755 index 00000000000..9cf4e00a0b3 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -0,0 +1,252 @@ +#!/bin/bash + +# TODO clean this header !!! +# run_tdnn_1f.sh is like run_tdnn_1e.sh but it use 2 to 6 jobs and add proportional-shrink 20. 
+ +#exp/chain_cleaned/tdnn1e_sp_bi/: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3597 combine=-0.095->-0.095 xent:train/valid[167,252,final]=(-1.37,-1.31,-1.31/-1.47,-1.44,-1.44) logprob:train/valid[167,252,final]=(-0.087,-0.078,-0.078/-0.102,-0.099,-0.099) +#exp/chain_cleaned/tdnn1f_sp_bi/: num-iters=444 nj=2..6 num-params=7.0M dim=40+100->3603 combine=-0.114->-0.113 xent:train/valid[295,443,final]=(-1.59,-1.51,-1.49/-1.58,-1.52,-1.50) logprob:train/valid[295,443,final]=(-0.112,-0.102,-0.098/-0.122,-0.113,-0.110) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1d_sp_bi exp/chain_cleaned/tdnn1e_sp_bi +# System tdnn1e_sp_bi tdnn1f_sp_bi +# WER on dev(orig) 9.2 9.0 +# WER on dev(rescored) 8.6 8.2 +# WER on test(orig) 9.4 9.1 +# WER on test(rescored) 8.9 8.7 +# Final train prob -0.0776 -0.0983 +# Final valid prob -0.0992 -0.1103 +# Final train prob (xent) -1.3110 -1.4893 +# Final valid prob (xent) -1.4353 -1.4951 + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=17 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. 
Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + 
linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 From 0bd925430b9a7ba126bf4d1b24708f58950ad4c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 16:38:57 +0200 Subject: [PATCH 04/35] add some rnnlm scripts WIP --- .../s5_r3/local/rnnlm/average_rnnlm.sh | 57 ++++++++++ .../s5_r3/local/rnnlm/prepare_rnnlm_data.sh | 61 +++++++++++ .../local/rnnlm/tuning/run_lstm_tdnn_a.sh | 101 ++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh create mode 100755 egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh create mode 100755 egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh diff --git a/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh new file mode 100755 index 00000000000..9ae9307d93d --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +# Copyright 2018 François Hernandez (Ubiqus) +# +# This script takes a rnnlm_dir and averages its models. +# +# Takes the default rnnlm_dir of tedlium s5_r3 recipe, +# and average the best model and the 10 previous and +# following ones (if they exist). + + +. ./cmd.sh +. 
./path.sh + +set -e -o pipefail -u + +rnnlm_dir=exp/rnnlm_lstm_tdnn_a +begin= +end= + +. utils/parse_options.sh # accept options + +# get the best iteration +best_iter=$(rnnlm/get_best_model.py $dir) + +# get num_iters +info=$(grep "num_iters" $rnnlm_dir/info.txt) +num_iters=${info##*=} + + +# test if begin and end exist +if [ -z $begin ] && [ -z $end ]; then + begin=$(($best_iter-10)) + end=$(($best_iter+10)) + if [ $begin -le 1 ]; then + begin=1 + fi + if [ ! $end -le $num_iters ]; then + end=$num_iters + fi +fi + +# create list of models and embeddings files to merge +models="" +embeddings="" +for num in $(seq -s' ' $begin $end); do + models=$models" $rnnlm_dir/$num.raw" + embeddings=$embeddings" $rnnlm_dir/feat_embedding.$num.mat" +done + +# merge list of files +nnet3-average $models ${rnnlm_dir}_averaged/final.raw +matrix-sum --average=true $embeddings ${rnnlm_dir}_averaged/feat_embedding.final.mat + +# copy other files to averaged rnnlm_dir +cp -r $rnnlm_dir/{info.txt,word_feats.txt,config,special_symbol_opts.txt} ${rnnlm_dir}_averaged + diff --git a/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh b/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh new file mode 100755 index 00000000000..ba6252450da --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# To be run from the egs/ directory. + +. path.sh + +set -e -o pipefail -u + +# it should contain things like +# foo.txt, bar.txt, and dev.txt (dev.txt is a special filename that's +# obligatory). +data_dir=data/rnnlm +dir=exp/rnnlm/ +mkdir -p $dir + +# validata data dir +rnnlm/validate_data_dir.py $data_dir/data/ + +# get unigram counts +rnnlm/get_unigram_counts.sh $data_dir/data/ + +# get vocab +mkdir -p $data_dir/vocab +rnnlm/get_vocab.py $data_dir/data > $data_dir/vocab/words.txt + +# Choose weighting and multiplicity of data. 
+# The following choices would mean that data-source 'foo' +# is repeated once per epoch and has a weight of 0.5 in the +# objective function when training, and data-source 'bar' is repeated twice +# per epoch and has a data -weight of 1.5. +# There is no contraint that the average of the data weights equal one. +# Note: if a data-source has zero multiplicity, it just means you are ignoring +# it; but you must include all data-sources. +#cat > exp/foo/data_weights.txt < $dir/data_weights.txt < $dir/unigram_probs.txt + +# choose features +rnnlm/choose_features.py --unigram-probs=$dir/unigram_probs.txt \ + $data_dir/vocab/words.txt > $dir/features.txt +# validate features +rnnlm/validate_features.py $dir/features.txt + +# make features for word +rnnlm/make_word_features.py --unigram-probs=$dir/unigram_probs.txt \ + $data_dir/vocab/words.txt $dir/features.txt \ + > $dir/word_feats.txt + +# validate word features +rnnlm/validate_word_features.py --features-file $dir/features.txt \ + $dir/word_feats.txt diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh new file mode 100755 index 00000000000..9519ab3e87e --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson +# 2017 Hainan Xu +# 2017 Ke Li +# 2018 François Hernandez (Ubiqus) +# +# rnnlm/train_rnnlm.sh: best iteration (out of 1060) was 1050, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 90.0 / 92.0. + +# Begin configuration section. +dir=exp/rnnlm_lstm_tdnn_a +embedding_dim=800 +lstm_rpd=200 +lstm_nrpd=200 +stage=-10 +train_stage=-10 +epochs=20 + +. ./cmd.sh +. 
utils/parse_options.sh +[ -z "$cmd" ] && cmd=$train_cmd + +text_from_audio=data/train/text +text=data/rnnlm/train.txt.shuffled +wordlist=data/lang_chain/words.txt +dev_sents=10000 +text_dir=data/rnnlm/text +mkdir -p $dir/config +set -e + +for f in $text $wordlist; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for local/prepare_data.sh and utils/prepare_lang.sh in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + # shuffle text from audio and lm + cat $text_from_audio | cut -d ' ' -f2- | cat $text |\ + shuf > data/rnnlm/full_lm_data.shuffled + # create dev and train sets based on audio and LM data + cat data/rnnlm/full_lm_data.shuffled | head -n $dev_sents> $text_dir/dev.txt + cat data/rnnlm/full_lm_data.shuffled | tail -n +$[$dev_sents+1] > $text_dir/ted.txt + +fi + +if [ $stage -le 1 ]; then + cp $wordlist $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features=10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig < Date: Fri, 4 May 2018 17:03:45 +0200 Subject: [PATCH 05/35] add {cmd,path,results}.sh --- egs/tedlium/s5_r3/cmd.sh | 27 +++++++++++++++++++++++++++ egs/tedlium/s5_r3/path.sh | 6 ++++++ egs/tedlium/s5_r3/results.sh | 10 ++++++++++ 3 files changed, 43 insertions(+) create mode 100755 egs/tedlium/s5_r3/cmd.sh create mode 100755 egs/tedlium/s5_r3/path.sh create mode 100755 egs/tedlium/s5_r3/results.sh diff --git a/egs/tedlium/s5_r3/cmd.sh b/egs/tedlium/s5_r3/cmd.sh new file mode 100755 index 00000000000..66ae9090820 --- /dev/null +++ b/egs/tedlium/s5_r3/cmd.sh @@ -0,0 +1,27 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. + +# Run locally: +#export train_cmd=run.pl +#export decode_cmd=run.pl + +# JHU cluster (or most clusters using GridEngine, with a suitable +# conf/queue.conf). 
+export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" + +host=$(hostname -f) +if [ ${host#*.} == "fit.vutbr.cz" ]; then + queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, + export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" + export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" + export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" +elif [ ${host#*.} == "cm.cluster" ]; then + # MARCC bluecrab cluster: + export train_cmd="slurm.pl --time 4:00:00 " + export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " +fi diff --git a/egs/tedlium/s5_r3/path.sh b/egs/tedlium/s5_r3/path.sh new file mode 100755 index 00000000000..16d5314b9c2 --- /dev/null +++ b/egs/tedlium/s5_r3/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/tedlium/s5_r3/results.sh b/egs/tedlium/s5_r3/results.sh new file mode 100755 index 00000000000..98bcab94ec5 --- /dev/null +++ b/egs/tedlium/s5_r3/results.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +filter_regexp=. 
+[ $# -ge 1 ] && filter_regexp=$1 + +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp + for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp +exit 0 + From 45b300b8e4f184e62a8b3f65272409e9f4ff58e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 4 May 2018 17:10:59 +0200 Subject: [PATCH 06/35] add some unchanged scripts from r2 to r3 --- egs/tedlium/s5_r3/local/format_lms.sh | 39 +++ egs/tedlium/s5_r3/local/join_suffix.py | 26 ++ .../s5_r3/local/nnet3/run_ivector_common.sh | 238 ++++++++++++++++++ egs/tedlium/s5_r3/local/score.sh | 1 + 4 files changed, 304 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/format_lms.sh create mode 100755 egs/tedlium/s5_r3/local/join_suffix.py create mode 100755 egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh create mode 120000 egs/tedlium/s5_r3/local/score.sh diff --git a/egs/tedlium/s5_r3/local/format_lms.sh b/egs/tedlium/s5_r3/local/format_lms.sh new file mode 100755 index 00000000000..bba5bbd17ec --- /dev/null +++ b/egs/tedlium/s5_r3/local/format_lms.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Copyright 2014 Nickolay V. Shmyrev +# Apache 2.0 + +if [ -f path.sh ]; then . path.sh; fi + + +small_arpa_lm=data/local/local_lm/data/arpa/4gram_small.arpa.gz +big_arpa_lm=data/local/local_lm/data/arpa/4gram_big.arpa.gz + +for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + + +set -e + +if [ -f data/lang_nosp/G.fst ] && [ data/lang_nosp/G.fst -nt $small_arpa_lm ]; then + echo "$0: not regenerating data/lang_nosp/G.fst as it already exists and " + echo ".. is newer than the source LM." 
+else + arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_nosp/words.txt \ + "gunzip -c $small_arpa_lm|" data/lang_nosp/G.fst + echo "$0: Checking how stochastic G is (the first of these numbers should be small):" + fstisstochastic data/lang_nosp/G.fst || true + utils/validate_lang.pl --skip-determinization-check data/lang_nosp +fi + + + +if [ -f data/lang_nosp_rescore/G.carpa ] && [ data/lang_nosp_rescore/G.carpa -nt $big_arpa_lm ] && \ + [ data/lang_nosp_rescore/G.carpa -nt data/lang_nosp/words.txt ]; then + echo "$0: not regenerating data/lang_nosp_rescore/ as it seems to already by up to date." +else + utils/build_const_arpa_lm.sh $big_arpa_lm data/lang_nosp data/lang_nosp_rescore || exit 1; +fi + +exit 0; diff --git a/egs/tedlium/s5_r3/local/join_suffix.py b/egs/tedlium/s5_r3/local/join_suffix.py new file mode 100755 index 00000000000..64c62964331 --- /dev/null +++ b/egs/tedlium/s5_r3/local/join_suffix.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# +# Copyright 2014 Nickolay V. Shmyrev +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + + +import sys +from codecs import open + +# This script joins together pairs of split-up words like "you 're" -> "you're". +# The TEDLIUM transcripts are normalized in a way that's not traditional for +# speech recognition. 
+ +for line in sys.stdin: + items = line.split() + new_items = [] + i = 1 + while i < len(items): + if i < len(items) - 1 and items[i+1][0] == '\'': + new_items.append(items[i] + items[i+1]) + i = i + 1 + else: + new_items.append(items[i]) + i = i + 1 + print(items[0] + ' ' + ' '.join(new_items)) diff --git a/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..337092b1520 --- /dev/null +++ b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +set -e -o pipefail + + +# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually +# be called by more scripts). It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + + +stage=0 +nj=30 +min_seg_len=1.55 # min length in seconds... we do this because chain training + # will discard segments shorter than 1.5 seconds. Must remain in sync + # with the same option given to prepare_lores_feats_and_alignments.sh +train_set=train_cleaned # you might set this to e.g. train. +gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it + # becomes exp/nnet3_cleaned or whatever. + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp_comb + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." 
+ exit 1 +fi + + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/tedlium-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp dev test; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp dev test; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 3 ]; then + echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" + # we have to combine short segments or we won't be able to train chain models + # on those segments. + utils/data/combine_short_segments.sh \ + data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb + + # just copy over the CMVN to avoid having to recompute it. 
+ cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ + utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ +fi + +if [ $stage -le 4 ]; then + echo "$0: selecting segments of hires training data that were also present in the" + echo " ... original training data." + + # note, these data-dirs are temporary; we put them in a sub-directory + # of the place where we'll make the alignments. + temp_data_root=exp/nnet3${nnet3_affix}/tri5 + mkdir -p $temp_data_root + + utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ + data/${train_set}_sp_hires $temp_data_root/${train_set}_hires + + # note: essentially all the original segments should be in the hires data. + n1=$(wc -l Date: Tue, 22 May 2018 10:43:00 +0200 Subject: [PATCH 07/35] add download script --- egs/tedlium/s5_r3/local/download_data.sh | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/download_data.sh diff --git a/egs/tedlium/s5_r3/local/download_data.sh b/egs/tedlium/s5_r3/local/download_data.sh new file mode 100755 index 00000000000..49de5b12372 --- /dev/null +++ b/egs/tedlium/s5_r3/local/download_data.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 John Hopkins University (author: Daniel Povey) +# Apache 2.0 + +mkdir -p db + +cd db ### Note: the rest of this script is executed from the directory 'db'. + +# TED-LIUM database: +if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then + if [ ! -e TEDLIUM_release-3 ]; then + ln -sf /export/corpora5/TEDLIUM_release-3 + fi + echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3" +else + if [ ! -e TEDLIUM_release-3 ]; then + echo "$0: downloading TEDLIUM_release-3 data (it won't re-download if it was already downloaded.)" + # the following command won't re-get it if it's already there + # because of the --continue switch. 
+      wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1
+      tar xf "TEDLIUM_release-3.tgz"
+    else
+      echo "$0: not downloading or un-tarring TEDLIUM_release-3 because it already exists."
+    fi
+fi
+
+
+num_sph=$(find TEDLIUM_release-3/data -name '*.sph' | wc -l)
+if [ "$num_sph" != 2351 ]; then
+  echo "$0: expected to find 2351 .sph files in the directory db/TEDLIUM_release-3, found $num_sph"
+  exit 1
+fi
+
+exit 0
+

From 58f7343a163c0b29fd86f85f1a6c8379844f8295 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?=
Date: Tue, 22 May 2018 10:58:07 +0200
Subject: [PATCH 08/35] local/prepare_data.sh

---
 egs/tedlium/s5_r3/local/prepare_data.sh | 76 +++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100755 egs/tedlium/s5_r3/local/prepare_data.sh

diff --git a/egs/tedlium/s5_r3/local/prepare_data.sh b/egs/tedlium/s5_r3/local/prepare_data.sh
new file mode 100755
index 00000000000..ea6241f7c29
--- /dev/null
+++ b/egs/tedlium/s5_r3/local/prepare_data.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+#
+# Copyright 2014 Nickolay V. Shmyrev
+#           2014 Brno University of Technology (Author: Karel Vesely)
+#           2016 Johns Hopkins University (Author: Daniel Povey)
+#           2018 François Hernandez
+#
+# Apache 2.0
+
+# To be run from one directory above this script.
+
+. ./path.sh
+
+export LC_ALL=C
+
+# Prepare: test, train,
+for set in dev test train; do
+  dir=data/$set.orig
+  mkdir -p $dir
+
+  # Merge transcripts into a single 'stm' file, do some mappings:
+  # - <F0_M> -> <o,f0,male> : map dev stm labels to be coherent with train + test,
+  # - <F0_F> -> <o,f0,female> : --||--
+  # - (2) -> null : remove pronunciation variants in transcripts, keep in dictionary
+  # - <sil> -> null : remove marked <sil>, it is modelled implicitly (in kaldi)
+  # - (...) 
-> null : remove utterance names from end-lines of train + # - it 's -> it's : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py) + { # Add STM header, so sclite can prepare the '.lur' file + echo ';; +;; LABEL "o" "Overall" "Overall results" +;; LABEL "f0" "f0" "Wideband channel" +;; LABEL "f2" "f2" "Telephone channel" +;; LABEL "male" "Male" "Male Talkers" +;; LABEL "female" "Female" "Female Talkers" +;;' + # Process the STMs + cat db/TEDLIUM_release-3/legacy/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \ + sed -e 's:([^ ]*)$::' | \ + awk '{ $2 = "A"; print $0; }' + } > data/$set.orig/stm + + # Prepare 'text' file + # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary + cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \ + awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100); + for (i=7;i<=NF;i++) { printf(" %s", $i); } + printf("\n"); + }' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1 + + # Prepare 'segments', 'utt2spk', 'spk2utt' + cat $dir/text | cut -d" " -f 1 | awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}' > $dir/segments + cat $dir/segments | awk '{print $1, $2}' > $dir/utt2spk + cat $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt + + # Prepare 'wav.scp', 'reco2file_and_channel' + cat $dir/spk2utt | awk -v set=$set -v pwd=$PWD '{ printf("%s sph2pipe -f wav -p %s/db/TEDLIUM_release-3/legacy/%s/sph/%s.sph |\n", $1, pwd, set, $1); }' > $dir/wav.scp + cat $dir/wav.scp | awk '{ print $1, $1, "A"; }' > $dir/reco2file_and_channel + + # Create empty 'glm' file + echo ';; empty.glm + [FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token + ' > data/$set.orig/glm + + # The training set seems to not have enough silence padding in the segmentations, + # especially at the beginning of segments. Extend the times. 
+  if [ $set == "train" ]; then
+    mv data/$set.orig/segments data/$set.orig/segments.temp
+    utils/data/extend_segment_times.py --start-padding=0.15 \
+      --end-padding=0.1 <data/$set.orig/segments.temp >data/$set.orig/segments || exit 1
+    rm data/$set.orig/segments.temp
+  fi
+
+  # Check that data dirs are okay!
+  utils/validate_data_dir.sh --no-feats $dir || exit 1
+done
+

From abae0fb849b1253fddcec354b628cd6e4c2d7156 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3%A7ois=20Hernandez?=
Date: Tue, 22 May 2018 10:58:07 +0200
Subject: [PATCH 09/35] local/prepare_dict.sh

---
 egs/tedlium/s5_r3/local/prepare_dict.sh | 38 +++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100755 egs/tedlium/s5_r3/local/prepare_dict.sh

diff --git a/egs/tedlium/s5_r3/local/prepare_dict.sh b/egs/tedlium/s5_r3/local/prepare_dict.sh
new file mode 100755
index 00000000000..3cdbcb3fdf6
--- /dev/null
+++ b/egs/tedlium/s5_r3/local/prepare_dict.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Copyright 2014 Nickolay V. Shmyrev
+#           2014 Brno University of Technology (Author: Karel Vesely)
+#           2016 Daniel Galvez
+#           2016 Vincent Nguyen
+# Apache 2.0
+#
+
+dir=data/local/dict_nosp
+mkdir -p $dir
+
+srcdict=db/TEDLIUM_release-3/TEDLIUM.152k.dic
+
+[ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1
+
+# Join dicts and fix some troubles
+cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \
+  LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt
+
+cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
+  grep -v SIL | sort > $dir/nonsilence_phones.txt
+
+( echo SIL; echo NSN ) > $dir/silence_phones.txt
+
+echo SIL > $dir/optional_silence.txt
+
+# No "extra questions" in the input to this setup, as we don't
+# have stress or tone.
+echo -n >$dir/extra_questions.txt
+
+# Add to the lexicon the silences, noises etc.
+# Typically, you would use "<UNK> NSN" here, but the Cantab Research language models
+# use <unk> instead of <UNK> to represent out of vocabulary words. 
+echo '<unk> NSN' | cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt
+
+# Check that the dict dir is okay!
+utils/validate_dict_dir.pl $dir || exit 1

From 6a2ae2949e4ee33c430fc16c17dd5b81fb505fa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3%A7ois=20Hernandez?=
Date: Tue, 22 May 2018 11:36:49 +0200
Subject: [PATCH 10/35] add option to download lms

---
 egs/tedlium/s5_r3/local/ted_download_lm.sh |  16 +++
 egs/tedlium/s5_r3/local/ted_train_lm.sh    | 139 +++++++++++++++++
 egs/tedlium/s5_r3/run.sh                   |  15 ++-
 3 files changed, 166 insertions(+), 4 deletions(-)
 create mode 100644 egs/tedlium/s5_r3/local/ted_download_lm.sh
 create mode 100755 egs/tedlium/s5_r3/local/ted_train_lm.sh

diff --git a/egs/tedlium/s5_r3/local/ted_download_lm.sh b/egs/tedlium/s5_r3/local/ted_download_lm.sh
new file mode 100644
index 00000000000..ad833555b5f
--- /dev/null
+++ b/egs/tedlium/s5_r3/local/ted_download_lm.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# Copyright 2018 David Snyder
+# Apache 2.0
+#
+# This script downloads pre-built language models trained on the Cantab-Tedlium
+# text data and Tedlium acoustic training data. If you want to build these
+# models yourself, run the script local/ted_train_lm.sh. 
+ +set -e + +echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P data/local/local_lm/data/arpa || exit 1 +wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P data/local/local_lm/data/arpa || exit 1 + +exit 0 \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/ted_train_lm.sh b/egs/tedlium/s5_r3/local/ted_train_lm.sh new file mode 100755 index 00000000000..20ea2ca3216 --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_train_lm.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 +# +# This script trains a LM on the Cantab-Tedlium text data and tedlium acoustic training data. +# It is based on the example scripts distributed with PocoLM + +# It will first check if pocolm is installed and if not will process with installation +# It will then get the source data from the pre-downloaded Cantab-Tedlium files +# and the pre-prepared data/train text source. + + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +num_dev_sentences=10000 + +#bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. 
+# These example numbers of metaparameters are for 4-gram model (with min-counts)
+# running with train_lm.py.
+# The dev perplexity should be close to the non-bypassed model.
bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406"
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure the make_lm_dir.py be called and train only 3-gram model
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  # Unzip TEDLIUM 6 data sources, normalize apostrophe+suffix to previous word, gzip the result.
+  gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > ${dir}/data/text/train.txt.gz
+  # use a subset of the annotated training data as the dev set .
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+  head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt
+  # .. and the rest of the training data as an additional data source.
+  # we can later fold the dev data into this.
+  tail -n +$[$num_dev_sentences+1] < data/train/text | cut -d " " -f 2- > ${dir}/data/text/ted.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (a subset of the training data is used as ${dir}/data/text/ted.txt to work
+  # out interpolation weights.
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cut -d " " -f 2- < data/dev/text > ${dir}/data/real_dev_set.txt
+
+  # get wordlist
+  awk '{print $1}' db/TEDLIUM_release-3/TEDLIUM.152k.dic | sed 's:([0-9])::g' | sort | uniq > ${dir}/data/wordlist
+fi
+
+order=4
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary. 
+ # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 ted=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=ted ${bypass_metaparam_optim_opt} \ + --min-counts="${min_counts}" \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #[perplexity = 157.87] over 18290.0 words +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes. + size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + # current results, after adding --limit-unk-history=true: + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words. + + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. 
+ size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404 over 18290.0 words. + + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index 7147476fe52..ad568891ddb 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -3,7 +3,7 @@ # Based mostly on the Switchboard recipe. The training database is TED-LIUM, # it consists of TED talks with cleaned automatic transcripts: # -# http://www-lium.univ-lemans.fr/en/content/ted-lium-corpus +# https://lium.univ-lemans.fr/ted-lium3/ # http://www.openslr.org/resources (Mirror). # # The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, @@ -28,8 +28,9 @@ nj=35 decode_nj=30 # note: should not be >38 which is the number of speakers in the dev set # after applying --seconds-per-spk-max 180. We decode with 4 threads, so # this will be too many jobs if you're using run.pl. -stage=0 +stage=5 train_rnnlm=true +train_lms=false . utils/parse_options.sh # accept options @@ -63,13 +64,19 @@ if [ $stage -le 4 ]; then # later on we'll change this script so you have the option to # download the pre-built LMs from openslr.org instead of building them # locally. 
- local/ted_train_lm.sh + if $train_lms; then + local/ted_train_lm.sh + else + local/ted_download_lm.sh + fi fi if [ $stage -le 5 ]; then local/format_lms.sh fi +exit + # Feature extraction if [ $stage -le 6 ]; then for set in test dev train; do @@ -202,7 +209,7 @@ if [ $stage -le 19 ]; then lang_dir=data/lang_chain ngram_order=4 - for set in dev test; do + for dset in dev test; do data_dir=data/${set}_hires decoding_dir=exp/chain/ # TODO path to tdnn dev and test decoding dirs suffix=$(basename $rnnlm_dir) From 1ac8696a8ccd76872a28b58b3f2ccf636cd21e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 11:40:20 +0200 Subject: [PATCH 11/35] remove local/join_suffix.py --- egs/tedlium/s5_r3/local/join_suffix.py | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100755 egs/tedlium/s5_r3/local/join_suffix.py diff --git a/egs/tedlium/s5_r3/local/join_suffix.py b/egs/tedlium/s5_r3/local/join_suffix.py deleted file mode 100755 index 64c62964331..00000000000 --- a/egs/tedlium/s5_r3/local/join_suffix.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2014 Nickolay V. Shmyrev -# 2016 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - - -import sys -from codecs import open - -# This script joins together pairs of split-up words like "you 're" -> "you're". -# The TEDLIUM transcripts are normalized in a way that's not traditional for -# speech recognition. 
- -for line in sys.stdin: - items = line.split() - new_items = [] - i = 1 - while i < len(items): - if i < len(items) - 1 and items[i+1][0] == '\'': - new_items.append(items[i] + items[i+1]) - i = i + 1 - else: - new_items.append(items[i]) - i = i + 1 - print(items[0] + ' ' + ' '.join(new_items)) From ceb03deace54a21c0d3b51ac5ecee0bedcc009e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 11:43:35 +0200 Subject: [PATCH 12/35] local/run_cleanup_segmentation.sh stage 16 --- .../s5_r3/local/run_cleanup_segmentation.sh | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100755 egs/tedlium/s5_r3/local/run_cleanup_segmentation.sh diff --git a/egs/tedlium/s5_r3/local/run_cleanup_segmentation.sh b/egs/tedlium/s5_r3/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..559d20046dd --- /dev/null +++ b/egs/tedlium/s5_r3/local/run_cleanup_segmentation.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri3 +nj=100 +decode_nj=16 +decode_num_threads=4 + +. ./path.sh +. ./cmd.sh +. 
utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. + steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + $data data/lang $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data data/lang $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 $cleaned_data data/lang ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +if [ $stage -le 4 ]; then + # Test with the models trained on cleaned-up data. + utils/mkgraph.sh data/lang ${cleaned_dir} ${cleaned_dir}/graph + + for dset in dev test; do + steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ + --cmd "$decode_cmd" --num-threads 4 \ + ${cleaned_dir}/graph data/${dset} ${cleaned_dir}/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} ${cleaned_dir}/decode_${dset} ${cleaned_dir}/decode_${dset}_rescore + done +fi From 52f70e12ea970b7e52ab6ed812bf4c2a97117ee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 12:10:50 +0200 Subject: [PATCH 13/35] add run_tdnnf.sh link --- egs/tedlium/s5_r3/local/chain/run_tdnnf.sh | 1 + egs/tedlium/s5_r3/run.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 egs/tedlium/s5_r3/local/chain/run_tdnnf.sh diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh new file mode 120000 index 00000000000..cbbf0ed6533 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh @@ -0,0 +1 @@ +tuning/run_tdnnf_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index ad568891ddb..d4c123abfaa 100755 --- 
a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -190,7 +190,7 @@ fi if [ $stage -le 17 ]; then # This will only work if you have GPUs on your system (and note that it requires # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) - local/chain/tuning/run_tdnn_PR2114.sh + local/chain/run_tdnnf.sh fi From 252c70d3b38c1588945c974a504538ae64a9beeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 12:24:52 +0200 Subject: [PATCH 14/35] clean header chain scripts --- .../s5_r3/local/chain/tuning/run_tdnn_1a.sh | 235 ++++++++++++++++++ .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 34 ++- 2 files changed, 250 insertions(+), 19 deletions(-) create mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..499fa7f6d49 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# See run_tdnnf_1a.sh for comparative results. + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1f #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +export CUDA_VISIBLE_DEVICES=0,1,2 + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 self-repair-scale=1.0e-04 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=1024 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=1024 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=1024 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is 
mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index 9cf4e00a0b3..8570e54c626 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -1,21 +1,19 @@ #!/bin/bash -# TODO clean this header !!! -# run_tdnn_1f.sh is like run_tdnn_1e.sh but it use 2 to 6 jobs and add proportional-shrink 20. 
- -#exp/chain_cleaned/tdnn1e_sp_bi/: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3597 combine=-0.095->-0.095 xent:train/valid[167,252,final]=(-1.37,-1.31,-1.31/-1.47,-1.44,-1.44) logprob:train/valid[167,252,final]=(-0.087,-0.078,-0.078/-0.102,-0.099,-0.099) -#exp/chain_cleaned/tdnn1f_sp_bi/: num-iters=444 nj=2..6 num-params=7.0M dim=40+100->3603 combine=-0.114->-0.113 xent:train/valid[295,443,final]=(-1.59,-1.51,-1.49/-1.58,-1.52,-1.50) logprob:train/valid[295,443,final]=(-0.112,-0.102,-0.098/-0.122,-0.113,-0.110) - -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1d_sp_bi exp/chain_cleaned/tdnn1e_sp_bi -# System tdnn1e_sp_bi tdnn1f_sp_bi -# WER on dev(orig) 9.2 9.0 -# WER on dev(rescored) 8.6 8.2 -# WER on test(orig) 9.4 9.1 -# WER on test(rescored) 8.9 8.7 -# Final train prob -0.0776 -0.0983 -# Final valid prob -0.0992 -0.1103 -# Final train prob (xent) -1.3110 -1.4893 -# Final valid prob (xent) -1.4353 -1.4951 +# run_tdnnf_1a.sh is the script whose results are presented in the corpus release paper. +# It uses 2 to 6 jobs and adds proportional-shrink 10. + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnnf_1a +# System tdnn_1a tdnnf_1a +# WER on dev(orig) 8.2 7.9 +# WER on dev(rescored) 7.6 7.2 +# WER on test(orig) 8.1 8.0 +# WER on test(rescored) 7.7 7.5 +# Final train prob -0.0802 -0.0779 +# Final valid prob -0.0980 -0.0906 +# Final train prob (xent) -1.1450 -0.9021 +# Final valid prob (xent) -1.2498 -0.9971 + ## how you run this (note: this assumes that the run_tdnn.sh soft link points here; ## otherwise call it directly in its location). @@ -28,14 +26,12 @@ # note, if you have already run the corresponding non-chain nnet3 system # (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. -# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism -# to get the configuration.
set -e -o pipefail # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). -stage=17 +stage=0 nj=30 decode_nj=30 min_seg_len=1.55 From 6f9bd8b1c8e5e257faeebab9c17b5023060b85a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:24:30 +0200 Subject: [PATCH 15/35] clean chain tuning naming --- egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh | 6 ++---- egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 499fa7f6d49..7a393db663c 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -34,8 +34,8 @@ nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1f #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/egs # you can set this to use previously dumped egs. +tdnn_affix=_1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. # End configuration section. echo "$0 $@" # Print the command line for logging @@ -44,8 +44,6 @@ echo "$0 $@" # Print the command line for logging . ./path.sh . ./utils/parse_options.sh -export CUDA_VISIBLE_DEVICES=0,1,2 - if ! 
cuda-compiled; then cat < Date: Tue, 22 May 2018 14:25:34 +0200 Subject: [PATCH 16/35] some lm related scripts --- egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh | 9 ++++++--- egs/tedlium/s5_r3/local/ted_download_lm.sh | 0 egs/tedlium/s5_r3/local/ted_download_rnnlm.sh | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) mode change 100644 => 100755 egs/tedlium/s5_r3/local/ted_download_lm.sh create mode 100755 egs/tedlium/s5_r3/local/ted_download_rnnlm.sh diff --git a/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh index 9ae9307d93d..61ad07645ff 100755 --- a/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh +++ b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh @@ -21,7 +21,7 @@ end= . utils/parse_options.sh # accept options # get the best iteration -best_iter=$(rnnlm/get_best_model.py $dir) +best_iter=$(rnnlm/get_best_model.py $rnnlm_dir) # get num_iters info=$(grep "num_iters" $rnnlm_dir/info.txt) @@ -44,11 +44,14 @@ fi models="" embeddings="" for num in $(seq -s' ' $begin $end); do - models=$models" $rnnlm_dir/$num.raw" - embeddings=$embeddings" $rnnlm_dir/feat_embedding.$num.mat" + [ -f $rnnlm_dir/$num.raw ] && \ + models=$models" $rnnlm_dir/$num.raw" + [ -f $rnnlm_dir/feat_embedding.$num.mat ] && \ + embeddings=$embeddings" $rnnlm_dir/feat_embedding.$num.mat" done # merge list of files +mkdir -p ${rnnlm_dir}_averaged nnet3-average $models ${rnnlm_dir}_averaged/final.raw matrix-sum --average=true $embeddings ${rnnlm_dir}_averaged/feat_embedding.final.mat diff --git a/egs/tedlium/s5_r3/local/ted_download_lm.sh b/egs/tedlium/s5_r3/local/ted_download_lm.sh old mode 100644 new mode 100755 diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh new file mode 100755 index 00000000000..fb85be9e897 --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Copyright 2018 François Hernandez +# Apache 2.0 +# +# This 
script downloads pre-built RNN language models trained on the TED-LIUM +# text data and acoustic training data. If you want to build these +# models yourself, run the script local/ted_train_rnnlm.sh. + +set -e + +echo "$0: downloading Tedlium RNNLM models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/6/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 +tar -xvzf exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz || exit 1 +rm exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz + +exit 0 \ No newline at end of file From 442d22c1ef63beade5d2951e622502649a815001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:29:10 +0200 Subject: [PATCH 17/35] minor change run.sh --- egs/tedlium/s5_r3/run.sh | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index d4c123abfaa..7bb384fe314 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -29,8 +29,8 @@ decode_nj=30 # note: should not be >38 which is the number of speakers in the # after applying --seconds-per-spk-max 180. We decode with 4 threads, so # this will be too many jobs if you're using run.pl. stage=5 -train_rnnlm=true -train_lms=false +train_rnnlm=false +train_lm=false . utils/parse_options.sh # accept options @@ -64,7 +64,7 @@ if [ $stage -le 4 ]; then # later on we'll change this script so you have the option to # download the pre-built LMs from openslr.org instead of building them # locally. 
- if $train_lms; then + if $train_lm; then local/ted_train_lm.sh else local/ted_download_lm.sh @@ -195,14 +195,18 @@ fi if [ $stage -le 18 ]; then - # todo add option to choose between training and downloading + # You can either train your own rnnlm or download a pre-trained one if $train_rnnlm; then local/rnnlm/tuning/run_lstm_tdnn_a.sh local/rnnlm/average_rnnlm.sh + else + local/ted_download_rnnlm.sh fi fi + + if [ $stage -le 19 ]; then # Here we rescore the lattices generated at stage 17 rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged @@ -211,7 +215,7 @@ if [ $stage -le 19 ]; then for dset in dev test; do data_dir=data/${set}_hires - decoding_dir=exp/chain/ # TODO path to tdnn dev and test decoding dirs + decoding_dir=exp/chain/tdnnf_1a suffix=$(basename $rnnlm_dir) output_dir=${decoding_dir}_$suffix From 50109119d048c1fc1c51b4be89d1459b3a59672c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:30:44 +0200 Subject: [PATCH 18/35] reset stage run --- egs/tedlium/s5_r3/run.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index 7bb384fe314..9bf240dd9d3 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -28,7 +28,7 @@ nj=35 decode_nj=30 # note: should not be >38 which is the number of speakers in the dev set # after applying --seconds-per-spk-max 180. We decode with 4 threads, so # this will be too many jobs if you're using run.pl. 
-stage=5 +stage=0 train_rnnlm=false train_lm=false @@ -75,8 +75,6 @@ if [ $stage -le 5 ]; then local/format_lms.sh fi -exit - # Feature extraction if [ $stage -le 6 ]; then for set in test dev train; do From dbb4440812d6afa090827e128cfd1d95a0e4cfa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:32:29 +0200 Subject: [PATCH 19/35] cosmetic --- egs/tedlium/s5_r3/run.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index 9bf240dd9d3..74904fbd1ac 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -176,7 +176,6 @@ if [ $stage -le 15 ]; then done fi - if [ $stage -le 16 ]; then # this does some data-cleaning. It actually degrades the GMM-level results # slightly, but the cleaned data should be useful when we add the neural net and chain @@ -184,14 +183,12 @@ if [ $stage -le 16 ]; then local/run_cleanup_segmentation.sh fi - if [ $stage -le 17 ]; then # This will only work if you have GPUs on your system (and note that it requires # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) local/chain/run_tdnnf.sh fi - if [ $stage -le 18 ]; then # You can either train your own rnnlm or download a pre-trained one if $train_rnnlm; then @@ -202,9 +199,6 @@ if [ $stage -le 18 ]; then fi fi - - - if [ $stage -le 19 ]; then # Here we rescore the lattices generated at stage 17 rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged @@ -226,6 +220,5 @@ if [ $stage -le 19 ]; then done fi - echo "$0: success." 
exit 0 From 62b8826611ecc4a85554f3e03f494d87090591b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:37:42 +0200 Subject: [PATCH 20/35] add rnnlm results --- egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh index 9519ab3e87e..302c67d1243 100755 --- a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -8,6 +8,14 @@ # rnnlm/train_rnnlm.sh: best iteration (out of 1060) was 1050, linking it to final iteration. # rnnlm/train_rnnlm.sh: train/dev perplexity was 90.0 / 92.0. +# System tdnn_1a tdnnf_1a +# WER on dev(orig) 8.2 7.9 +# WER on dev(ngram) 7.6 7.2 +# WER on dev(rnnlm) 6.3 6.1 +# WER on test(orig) 8.1 8.0 +# WER on test(ngram) 7.7 7.5 +# WER on test(rnnlm) 6.7 6.6 + # Begin configuration section. 
dir=exp/rnnlm_lstm_tdnn_a embedding_dim=800 From cc6284183c4df5533b682b5fa2262a5f7b110b80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:42:58 +0200 Subject: [PATCH 21/35] LM corpus for rnnlm --- egs/tedlium/s5_r3/local/prepare_data.sh | 3 +++ egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh | 1 + egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) create mode 120000 egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh diff --git a/egs/tedlium/s5_r3/local/prepare_data.sh b/egs/tedlium/s5_r3/local/prepare_data.sh index ea6241f7c29..c8a9e0a8665 100755 --- a/egs/tedlium/s5_r3/local/prepare_data.sh +++ b/egs/tedlium/s5_r3/local/prepare_data.sh @@ -13,6 +13,9 @@ export LC_ALL=C +# Prepare LM data +gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > data/LM/train.txt + # Prepare: test, train, for set in dev test train; do dir=data/$set.orig diff --git a/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh b/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh new file mode 120000 index 00000000000..72a3172db41 --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh @@ -0,0 +1 @@ +tuning/run_lstm_tdnn_a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh index 302c67d1243..32252db937d 100755 --- a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -30,7 +30,7 @@ epochs=20 [ -z "$cmd" ] && cmd=$train_cmd text_from_audio=data/train/text -text=data/rnnlm/train.txt.shuffled +text=data/LM/train.txt wordlist=data/lang_chain/words.txt dev_sents=10000 text_dir=data/rnnlm/text From c022a0a813295e9344c43611114735d9af8a439b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Tue, 22 May 2018 14:49:16 +0200 Subject: [PATCH 22/35] remove useless config files --- 
egs/tedlium/s5_r3/conf/decode.config | 1 - egs/tedlium/s5_r3/conf/decode_dnn.config | 2 -- egs/tedlium/s5_r3/conf/fbank.conf | 5 ----- egs/tedlium/s5_r3/conf/no_k20.conf | 13 ------------- egs/tedlium/s5_r3/conf/pitch.conf | 2 -- 5 files changed, 23 deletions(-) delete mode 100644 egs/tedlium/s5_r3/conf/decode.config delete mode 100644 egs/tedlium/s5_r3/conf/decode_dnn.config delete mode 100644 egs/tedlium/s5_r3/conf/fbank.conf delete mode 100644 egs/tedlium/s5_r3/conf/no_k20.conf delete mode 100644 egs/tedlium/s5_r3/conf/pitch.conf diff --git a/egs/tedlium/s5_r3/conf/decode.config b/egs/tedlium/s5_r3/conf/decode.config deleted file mode 100644 index 7ba966f2b83..00000000000 --- a/egs/tedlium/s5_r3/conf/decode.config +++ /dev/null @@ -1 +0,0 @@ -# empty config, just use the defaults. diff --git a/egs/tedlium/s5_r3/conf/decode_dnn.config b/egs/tedlium/s5_r3/conf/decode_dnn.config deleted file mode 100644 index ab8dcc1dc08..00000000000 --- a/egs/tedlium/s5_r3/conf/decode_dnn.config +++ /dev/null @@ -1,2 +0,0 @@ -beam=13.0 # beam for decoding. Was 13.0 in the scripts. -lattice_beam=8.0 # this has most effect on size of the lattices. 
diff --git a/egs/tedlium/s5_r3/conf/fbank.conf b/egs/tedlium/s5_r3/conf/fbank.conf deleted file mode 100644 index 4c57f8a8765..00000000000 --- a/egs/tedlium/s5_r3/conf/fbank.conf +++ /dev/null @@ -1,5 +0,0 @@ ---window-type=hamming # disable Dans window, use the standard ---use-energy=false # only fbank outputs ---dither=1 ---num-mel-bins=40 # 8 filters/octave, 40 filters/16Khz as used by IBM ---htk-compat=true # try to make it compatible with HTK diff --git a/egs/tedlium/s5_r3/conf/no_k20.conf b/egs/tedlium/s5_r3/conf/no_k20.conf deleted file mode 100644 index f0cba4df971..00000000000 --- a/egs/tedlium/s5_r3/conf/no_k20.conf +++ /dev/null @@ -1,13 +0,0 @@ -# Default configuration -command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -option mem=* -l mem_free=$0,ram_free=$0 -option mem=0 # Do not add anything to qsub_opts -option num_threads=* -pe smp $0 -option num_threads=1 # Do not add anything to qsub_opts -option max_jobs_run=* -tc $0 -default gpu=0 -option gpu=0 -q all.q -option gpu=* -l gpu=$0 -q g.q -default allow_k20=true -option allow_k20=true -option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' diff --git a/egs/tedlium/s5_r3/conf/pitch.conf b/egs/tedlium/s5_r3/conf/pitch.conf deleted file mode 100644 index bba51335be3..00000000000 --- a/egs/tedlium/s5_r3/conf/pitch.conf +++ /dev/null @@ -1,2 +0,0 @@ ---nccf-ballast-online=true # helps for online operation. - From 774b2533b07a8c9c90a4af91325334dcaeb26009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 23 May 2018 09:39:16 +0200 Subject: [PATCH 23/35] remove host stuff from cmd.sh --- egs/tedlium/s5_r3/cmd.sh | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/egs/tedlium/s5_r3/cmd.sh b/egs/tedlium/s5_r3/cmd.sh index 66ae9090820..23a2b7b6a51 100755 --- a/egs/tedlium/s5_r3/cmd.sh +++ b/egs/tedlium/s5_r3/cmd.sh @@ -12,16 +12,4 @@ # JHU cluster (or most clusters using GridEngine, with a suitable # conf/queue.conf). 
export train_cmd="queue.pl" -export decode_cmd="queue.pl --mem 4G" - -host=$(hostname -f) -if [ ${host#*.} == "fit.vutbr.cz" ]; then - queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, - export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" - export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" - export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" -elif [ ${host#*.} == "cm.cluster" ]; then - # MARCC bluecrab cluster: - export train_cmd="slurm.pl --time 4:00:00 " - export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " -fi +export decode_cmd="queue.pl --mem 4G" \ No newline at end of file From 3855c7b112c564b533380f5f3ec857daeebeb76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 23 May 2018 09:40:47 +0200 Subject: [PATCH 24/35] change rnnlm download link --- egs/tedlium/s5_r3/local/ted_download_rnnlm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh index fb85be9e897..609f0194541 100755 --- a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -10,7 +10,7 @@ set -e echo "$0: downloading Tedlium RNNLM models (it won't re-download if it was already downloaded.)" -wget --continue http://kaldi-asr.org/models/6/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 +wget --continue http://kaldi-asr.org/models/5/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 tar -xvzf exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz || exit 1 rm exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz From aed82134f9a81fd58bad6c1aa256944074331826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 24 May 2018 09:43:47 +0200 Subject: [PATCH 25/35] change tdnnf affix --- egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index a5ce24454b1..1098e5598ef 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -45,7 +45,7 @@ nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=f_1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. # End configuration section. @@ -77,7 +77,7 @@ gmm_dir=exp/$gmm ali_dir=exp/${gmm}_ali_${train_set}_sp_comb tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix} +dir=exp/chain${nnet3_affix}/tdnnf${tdnnf_affix} train_data_dir=data/${train_set}_sp_hires_comb lores_train_data_dir=data/${train_set}_sp_comb train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb From 4a6a5072dd54eb6b7c7cb8d71850c3e458fdfb36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 24 May 2018 10:28:04 +0200 Subject: [PATCH 26/35] change chunk width tdnn --- egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 7a393db663c..c7357310b84 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -185,7 +185,7 @@ if [ $stage -le 18 ]; then 
--chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width 150,110,100 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ From 09a849aaf9a718975f392eb17e22293ae3a8c667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 24 May 2018 10:29:52 +0200 Subject: [PATCH 27/35] update ivector common strategy --- .../s5_r3/local/chain/tuning/run_tdnn_1a.sh | 10 +- .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 10 +- .../s5_r3/local/nnet3/run_ivector_common.sh | 114 +++++------------- 3 files changed, 40 insertions(+), 94 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index c7357310b84..5e19fb5f0a0 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -63,13 +63,13 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi_1024_ps10 -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb +train_data_dir=data/${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index 1098e5598ef..d4c2a0e0215 100755 --- 
a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -74,13 +74,13 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnnf${tdnnf_affix} -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb +train_data_dir=data/${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ diff --git a/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh index 337092b1520..5322da6240f 100755 --- a/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh +++ b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh @@ -10,9 +10,7 @@ set -e -o pipefail stage=0 nj=30 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh + train_set=train_cleaned # you might set this to e.g. train. gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. @@ -27,7 +25,7 @@ nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! 
-f $f ]; then @@ -79,64 +77,23 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. 
- n1=$(wc -l Date: Thu, 24 May 2018 10:31:32 +0200 Subject: [PATCH 28/35] remove bi suffix and small fix --- egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh | 4 ++-- egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 5e19fb5f0a0..e6613529ab6 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -64,9 +64,9 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/$gmm ali_dir=exp/${gmm}_ali_${train_set}_sp -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} +tree_dir=exp/chain${nnet3_affix}/tree${tree_affix} lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi_1024_ps10 +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp train_data_dir=data/${train_set}_sp_hires lores_train_data_dir=data/${train_set}_sp train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index d4c2a0e0215..cb468748b47 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -75,7 +75,7 @@ local/nnet3/run_ivector_common.sh --stage $stage \ gmm_dir=exp/$gmm ali_dir=exp/${gmm}_ali_${train_set}_sp -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} +tree_dir=exp/chain${nnet3_affix}/tree${tree_affix} lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnnf${tdnnf_affix} train_data_dir=data/${train_set}_sp_hires From 27067fc9cb69e24cc735b7bc1e91f870d87fceb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Wed, 6 Jun 2018 10:58:21 +0200 Subject: [PATCH 29/35] fix join_suffix stm data prep, add scoring scripts 
--- egs/tedlium/s5_r3/local/prepare_data.sh | 5 +- egs/tedlium/s5_r3/local/score.sh | 2 +- egs/tedlium/s5_r3/local/score_basic.sh | 55 ++++++++++++++ egs/tedlium/s5_r3/local/score_sclite.sh | 96 +++++++++++++++++++++++++ egs/tedlium/s5_r3/local/ted_train_lm.sh | 2 +- 5 files changed, 154 insertions(+), 6 deletions(-) create mode 100755 egs/tedlium/s5_r3/local/score_basic.sh create mode 100755 egs/tedlium/s5_r3/local/score_sclite.sh diff --git a/egs/tedlium/s5_r3/local/prepare_data.sh b/egs/tedlium/s5_r3/local/prepare_data.sh index c8a9e0a8665..8de1752742b 100755 --- a/egs/tedlium/s5_r3/local/prepare_data.sh +++ b/egs/tedlium/s5_r3/local/prepare_data.sh @@ -13,9 +13,6 @@ export LC_ALL=C -# Prepare LM data -gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > data/LM/train.txt - # Prepare: test, train, for set in dev test train; do dir=data/$set.orig @@ -40,7 +37,7 @@ for set in dev test train; do cat db/TEDLIUM_release-3/legacy/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \ sed -e 's:([^ ]*)$::' | \ awk '{ $2 = "A"; print $0; }' - } > data/$set.orig/stm + } | local/join_suffix.py > data/$set.orig/stm # Prepare 'text' file # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary diff --git a/egs/tedlium/s5_r3/local/score.sh b/egs/tedlium/s5_r3/local/score.sh index d89286dc25a..f2835abb6d9 120000 --- a/egs/tedlium/s5_r3/local/score.sh +++ b/egs/tedlium/s5_r3/local/score.sh @@ -1 +1 @@ -score_sclite.sh \ No newline at end of file +score_basic.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/score_basic.sh b/egs/tedlium/s5_r3/local/score_basic.sh new file mode 100755 index 00000000000..47b57396c64 --- /dev/null +++ b/egs/tedlium/s5_r3/local/score_basic.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +min_lmwt=7 +max_lmwt=17 +#end configuration section. + +[ -f ./path.sh ] && . 
./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text > $dir/scoring/test_filt.txt + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ + lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \ + "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1; + +# Note: the double level of quoting for the sed command + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| \ + sed "'s:::g'" \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; + +# Show results +for f in $dir/wer_*; do echo $f; egrep '(WER)|(SER)' < $f; done + +exit 0; diff --git a/egs/tedlium/s5_r3/local/score_sclite.sh b/egs/tedlium/s5_r3/local/score_sclite.sh new file mode 100755 index 00000000000..16c8b30e52f --- /dev/null +++ b/egs/tedlium/s5_r3/local/score_sclite.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012, +# Brno University of Technology (Author: Karel Vesely) 2014, +# Apache 2.0 +# + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +beam=7 # speed-up, but may affect MBR confidences. +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. 
parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +dir=$3 + +model=$dir/../$iter.mdl # assume model one level up from decoding dir. + +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; +hubdir=`dirname $hubscr` + +for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ + $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +# name=`basename $data`; # e.g. 
eval2000 +nj=$(cat $dir/num_jobs) + +mkdir -p $dir/scoring/log + +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + +if [ $stage -le 0 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + set -e -o pipefail \; \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-align-words --output-error-lats=true --max-expand=10.0 --test=false \ + $lang/phones/word_boundary.int $model ark:- ark:- \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \| \ + sort -k1,1 -k2,2 -k3,3nb '>' $dir/score_LMWT_${wip}/ctm || exit 1; + done +fi + +if [ $stage -le 1 ]; then + # Remove some stuff we don't want to score, from the ctm. + for x in $dir/score_*/ctm; do + # `-i` is not needed in the following. It is added for robustness in ase this code is copy-pasted + # into another script that, e.g., uses instead of + grep -v -w -i '' <$x > ${x}.filt || exit 1; + done +fi + +# Score the set... 
+if [ $stage -le 2 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \ + cp $data/stm $dir/score_LMWT_${wip}/ '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/ctm.filt || exit 1; + done +fi + +exit 0 diff --git a/egs/tedlium/s5_r3/local/ted_train_lm.sh b/egs/tedlium/s5_r3/local/ted_train_lm.sh index 20ea2ca3216..3c587f63094 100755 --- a/egs/tedlium/s5_r3/local/ted_train_lm.sh +++ b/egs/tedlium/s5_r3/local/ted_train_lm.sh @@ -58,7 +58,7 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true - # Unzip TEDLIUM 6 data sources, normalize apostrophe+suffix to previous word, gzip the result. + # Unzip TEDLIUM 6 data sources, remove , gzip the result. gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > ${dir}/data/text/train.txt.gz # use a subset of the annotated training data as the dev set . # Note: the name 'dev' is treated specially by pocolm, it automatically From 677f7542e66fa4d7b496d9ca7ff2e3420f967d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 22 Jun 2018 12:17:15 +0200 Subject: [PATCH 30/35] fix ted_download_rnnlm script --- egs/tedlium/s5_r3/local/ted_download_rnnlm.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh index 609f0194541..431d44c6ff6 100755 --- a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -11,7 +11,12 @@ set -e echo "$0: downloading Tedlium RNNLM models (it won't re-download if it was already downloaded.)" wget --continue http://kaldi-asr.org/models/5/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 -tar -xvzf exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz || exit 1 -rm exp/rnnlm_lstm_tdnn_a_averaged/tedlium_rnnlm.tgz +cd 
exp/rnnlm_lstm_tdnn_a_averaged +tar -xvzf tedlium_rnnlm.tgz || exit 1 +rm tedlium_rnnlm.tgz +mkdir config +cd ../.. +cp data/lang/words.txt exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt +echo " 152217" >> exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt -exit 0 \ No newline at end of file +exit 0 From b1d93006a1d60551ace9d77503094c026e7d6cbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 22 Jun 2018 12:17:32 +0200 Subject: [PATCH 31/35] fix rnnlm rescoring in run.sh --- egs/tedlium/s5_r3/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh index 74904fbd1ac..d4f3a38fd49 100755 --- a/egs/tedlium/s5_r3/run.sh +++ b/egs/tedlium/s5_r3/run.sh @@ -206,8 +206,8 @@ if [ $stage -le 19 ]; then ngram_order=4 for dset in dev test; do - data_dir=data/${set}_hires - decoding_dir=exp/chain/tdnnf_1a + data_dir=data/${dset}_hires + decoding_dir=exp/chain_cleaned/tdnnf_1a suffix=$(basename $rnnlm_dir) output_dir=${decoding_dir}_$suffix From bf6807154e58f1f55fa27d43c9bcaee7bd73581b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Fri, 22 Jun 2018 12:18:23 +0200 Subject: [PATCH 32/35] add both sclite and score_basic scores in tdnnf script --- .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh index cb468748b47..d807c636ace 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh @@ -4,15 +4,18 @@ # It use 2 to 6 jobs and add proportional-shrink 10. 
# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnnf_1a -# System tdnn_1a tdnnf_1a -# WER on dev(orig) 8.2 7.9 -# WER on dev(rescored) 7.6 7.2 -# WER on test(orig) 8.1 8.0 -# WER on test(rescored) 7.7 7.5 -# Final train prob -0.0802 -0.0779 -# Final valid prob -0.0980 -0.0906 -# Final train prob (xent) -1.1450 -0.9021 -# Final valid prob (xent) -1.2498 -0.9971 +# System tdnn_1a tdnnf_1a tdnnf_1a +# Scoring script sclite sclite score_basic +# WER on dev(orig) 8.2 7.9 7.9 +# WER on dev(rescored ngram) 7.6 7.4 7.5 +# WER on dev(rescored rnnlm) 6.3 6.2 6.2 +# WER on test(orig) 8.1 8.0 8.2 +# WER on test(rescored ngram) 7.7 7.7 7.9 +# WER on test(rescored rnnlm) 6.7 6.7 6.8 +# Final train prob -0.0802 -0.0899 +# Final valid prob -0.0980 -0.0974 +# Final train prob (xent) -1.1450 -0.9449 +# Final valid prob (xent) -1.2498 -1.0002 ## how you run this (note: this assumes that the run_tdnn.sh soft link points here; @@ -64,6 +67,7 @@ where "nvcc" is installed. EOF fi + local/nnet3/run_ivector_common.sh --stage $stage \ --nj $nj \ --min-seg-len $min_seg_len \ @@ -217,8 +221,6 @@ if [ $stage -le 18 ]; then --dir $dir fi - - if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from From 75e9d60ca5982ac27ba3e2d94fd97bc540f972cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 28 Jun 2018 12:39:30 +0200 Subject: [PATCH 33/35] some fix to tdnn scripts --- egs/tedlium/s5_r3/cmd.sh | 2 +- .../s5_r3/local/chain/compare_wer_general.sh | 111 ++++++++ .../s5_r3/local/chain/tuning/run_tdnn_1a.sh | 18 +- .../s5_r3/local/chain/tuning/run_tdnn_1b.sh | 251 ++++++++++++++++++ 4 files changed, 380 insertions(+), 2 deletions(-) create mode 100755 egs/tedlium/s5_r3/local/chain/compare_wer_general.sh create mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh diff --git 
a/egs/tedlium/s5_r3/cmd.sh b/egs/tedlium/s5_r3/cmd.sh index 23a2b7b6a51..56c1d783a9e 100755 --- a/egs/tedlium/s5_r3/cmd.sh +++ b/egs/tedlium/s5_r3/cmd.sh @@ -12,4 +12,4 @@ # JHU cluster (or most clusters using GridEngine, with a suitable # conf/queue.conf). export train_cmd="queue.pl" -export decode_cmd="queue.pl --mem 4G" \ No newline at end of file +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..88dde1ff0e2 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} + + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("# WER on dev(orig) " "# WER on dev(rescored) " "# WER on test(orig) " "# WER on test(rescored)") + +for n in 0 1 2 3; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index e6613529ab6..40cdcb5b5ff 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -1,6 +1,22 @@ #!/bin/bash -# See run_tdnnf_1a.sh for comparative results. +# Results + +# System tdnn_1a +# Scoring script sclite +# WER on dev(orig) 8.2 +# WER on dev(rescored ngram) 7.6 +# WER on dev(rescored rnnlm) 6.3 +# WER on test(orig) 8.1 +# WER on test(rescored ngram) 7.7 +# WER on test(rescored rnnlm) 6.7 +# Final train prob -0.0802 +# Final valid prob -0.0980 +# Final train prob (xent) -1.1450 +# Final valid prob (xent) -1.2498 +# Num-params 26651840 + + ## how you run this (note: this assumes that the run_tdnn.sh soft link points here; ## otherwise call it directly in its location). 
diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..2d1506f713c --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,251 @@ +#!/bin/bash + +# run_tdnn_1b.sh is the script which results are presented in the corpus release paper. +# It use 2 to 6 jobs and add proportional-shrink 10. + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnn_1b +# System tdnn_1a tdnn_1b tdnn_1b +# Scoring script sclite sclite score_basic +# WER on dev(orig) 8.2 7.9 7.9 +# WER on dev(rescored ngram) 7.6 7.4 7.5 +# WER on dev(rescored rnnlm) 6.3 6.2 6.2 +# WER on test(orig) 8.1 8.0 8.2 +# WER on test(rescored ngram) 7.7 7.7 7.9 +# WER on test(rescored rnnlm) 6.7 6.7 6.8 +# Final train prob -0.0802 -0.0899 +# Final valid prob -0.0980 -0.0974 +# Final train prob (xent) -1.1450 -0.9449 +# Final valid prob (xent) -1.2498 -1.0002 +# Num-params 26651840 25782720 + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
+train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1280 + linear-component name=tdnn2l dim=256 input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 + relu-batchnorm-layer name=tdnn3 dim=1280 + linear-component name=tdnn4l dim=256 input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 + relu-batchnorm-layer name=tdnn5 dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + 
linear-component name=tdnn10l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 + relu-batchnorm-layer name=prefinal-chain input=prefinal-l dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets + relu-batchnorm-layer name=prefinal-xent input=prefinal-l dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + +if [ $stage -le 19 ]; then 
+ # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 From 6095425e2ea886e5fc6b7fe1f5f7e66134151a42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 28 Jun 2018 12:43:26 +0200 Subject: [PATCH 34/35] minor fix preparation scripts --- egs/tedlium/s5_r3/local/prepare_data.sh | 2 -- egs/tedlium/s5_r3/local/prepare_dict.sh | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/egs/tedlium/s5_r3/local/prepare_data.sh b/egs/tedlium/s5_r3/local/prepare_data.sh index 8de1752742b..c4b911601e5 100755 --- a/egs/tedlium/s5_r3/local/prepare_data.sh +++ b/egs/tedlium/s5_r3/local/prepare_data.sh @@ -11,8 +11,6 @@ . 
./path.sh -export LC_ALL=C - # Prepare: test, train, for set in dev test train; do dir=data/$set.orig diff --git a/egs/tedlium/s5_r3/local/prepare_dict.sh b/egs/tedlium/s5_r3/local/prepare_dict.sh index 3cdbcb3fdf6..204b3f910e5 100755 --- a/egs/tedlium/s5_r3/local/prepare_dict.sh +++ b/egs/tedlium/s5_r3/local/prepare_dict.sh @@ -10,7 +10,7 @@ dir=data/local/dict_nosp mkdir -p $dir -srcdict=db//TEDLIUM_release-3/TEDLIUM.152k.dic +srcdict=db/TEDLIUM_release-3/TEDLIUM.152k.dic [ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 From 285b3896dac0c65a7ec6cf1dc7deed4179d09198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Hernandez?= Date: Thu, 12 Jul 2018 21:22:10 +0200 Subject: [PATCH 35/35] add warning tdnnf setup --- egs/tedlium/s5_r3/local/chain/run_tdnnf.sh | 2 +- .../s5_r3/local/chain/tuning/run_tdnn_1b.sh | 8 +- .../s5_r3/local/chain/tuning/run_tdnnf_1a.sh | 250 ------------------ 3 files changed, 8 insertions(+), 252 deletions(-) delete mode 100755 egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh index cbbf0ed6533..61f8f499182 120000 --- a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh +++ b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh @@ -1 +1 @@ -tuning/run_tdnnf_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh index 2d1506f713c..f8eec8c5213 100755 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh @@ -1,7 +1,13 @@ #!/bin/bash # run_tdnn_1b.sh is the script which results are presented in the corpus release paper. -# It use 2 to 6 jobs and add proportional-shrink 10. +# It uses 2 to 6 jobs and add proportional-shrink 10. + +# WARNING +# This script is flawed and misses key elements to optimize the tdnnf setup. 
+# You can run it as is to reproduce results from the corpus release paper, +# but a more up-to-date version should be looked at in other egs until another +# setup is added here. # local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnn_1b # System tdnn_1a tdnn_1b tdnn_1b diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh deleted file mode 100755 index d807c636ace..00000000000 --- a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnnf_1a.sh +++ /dev/null @@ -1,250 +0,0 @@ -#!/bin/bash - -# run_tdnnf_1a.sh is the script which results are presented in the corpus release paper. -# It use 2 to 6 jobs and add proportional-shrink 10. - -# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnnf_1a -# System tdnn_1a tdnnf_1a tdnnf_1a -# Scoring script sclite sclite score_basic -# WER on dev(orig) 8.2 7.9 7.9 -# WER on dev(rescored ngram) 7.6 7.4 7.5 -# WER on dev(rescored rnnlm) 6.3 6.2 6.2 -# WER on test(orig) 8.1 8.0 8.2 -# WER on test(rescored ngram) 7.7 7.7 7.9 -# WER on test(rescored rnnlm) 6.7 6.7 6.8 -# Final train prob -0.0802 -0.0899 -# Final valid prob -0.0980 -0.0974 -# Final train prob (xent) -1.1450 -0.9449 -# Final valid prob (xent) -1.2498 -1.0002 - - -## how you run this (note: this assumes that the run_tdnn.sh soft link points here; -## otherwise call it directly in its location). -# by default, with cleanup: -# local/chain/run_tdnn.sh - -# without cleanup: -# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & - -# note, if you have already run the corresponding non-chain nnet3 system -# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). 
-stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 $opts dim=1280 - linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn3l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn3 $opts dim=1280 - linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) - relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 - linear-component name=tdnn5l dim=256 $linear_opts - relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) - linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 - linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 - 
linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 - linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 - linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) - relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 - linear-component name=prefinal-l dim=256 $linear_opts - relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - -fi - -if [ $stage -le 18 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.proportional-shrink 10 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 6 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs false \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph -fi - -if [ $stage -le 20 ]; then - rm $dir/.error 2>/dev/null || true - for dset in dev test; do - ( - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0