diff --git a/.gitignore b/.gitignore index 910d5cb019d..4cf0fa4efa9 100644 --- a/.gitignore +++ b/.gitignore @@ -83,6 +83,7 @@ GSYMS /tools/ATLAS/ /tools/atlas3.8.3.tar.gz /tools/irstlm/ +/tools/mitlm/ /tools/openfst /tools/openfst-1.3.2.tar.gz /tools/openfst-1.3.2/ diff --git a/.travis.yml b/.travis.yml index 23507297413..51e49653efc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ script: # for the explanation why extra switches needed for clang with ccache. - CXX="ccache clang++-3.8 -Qunused-arguments -fcolor-diagnostics -Wno-tautological-compare" CFLAGS="" - LDFLAGS="-llapack" + LDFLAGS="-llapack -Wl,-fuse-ld=gold" INCDIRS="$XROOT/usr/include" LIBDIRS="$XROOT/usr/lib" tools/extras/travis_script.sh diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index 3157d7ffec7..7112e0259a0 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -87,18 +87,15 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + print "s/^$_[0] $_[2] $_[3] $_[4]\$/$_[0] $_[2] $pt $_[4]/;\n" } - $pu=$_[1]; $pt=$_[4]; + $pu=$_[1]; $pt=$_[4]; }' > $dir/segments_to_fix -if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + +if [ -s $dir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $dir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $dir/segments - done < $dir/segments_to_fix + perl -i -pf $dir/segments_to_fix $dir/segments fi # Copy stuff into its final locations diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh index 4cfa9110edf..9c4b55308f2 100755 --- a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -94,19 +94,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. 
Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 91baa37d6e1..815e1b2d270 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -101,19 +101,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh index 746c42c4c1a..c54876331f1 100755 --- a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh @@ -93,18 +93,15 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + print "s/^$_[0] $_[2] $_[3] $_[4]\$/$_[0] $_[2] $pt $_[4]/;\n" } $pu=$_[1]; $pt=$_[4]; }' > $dir/segments_to_fix -if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + +if [ -s $dir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $dir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $dir/segments - done < $dir/segments_to_fix + perl -i -pf $dir/segments_to_fix $dir/segments fi # Copy stuff into its final locations diff --git a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh index 65f514f223c..475ef5405ba 100755 --- a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh @@ -99,19 +99,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. 
Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh index 1378f8b8965..d7ce038c0a7 100755 --- a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh @@ -111,25 +111,21 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data # script] mkdir -p $dir -for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do +for f in segments_to_fix spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do cp $tmpdir/$f $dir/$f || exit 1; done diff --git a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh index 804de611cae..8297cdee9ca 100755 --- a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh +++ b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh @@ -114,7 +114,7 @@ cp ${output_dir}_non_normalized/info/* $output_dir/info # rename file location in the noise-rir pairing files for file in `ls $output_dir/info/noise_impulse*`; do - sed -i "s/_non_normalized//g" $file + perl -i -pe "s/_non_normalized//g" $file done # generating the rir-list with probabilities alloted for each rir diff --git a/egs/babel/s5c/local/syllab/generate_syllable_lang.sh b/egs/babel/s5c/local/syllab/generate_syllable_lang.sh index 2d1fcb2259e..4a0810b9415 100755 --- a/egs/babel/s5c/local/syllab/generate_syllable_lang.sh +++ b/egs/babel/s5c/local/syllab/generate_syllable_lang.sh @@ -118,8 +118,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." 
exit 0 diff --git a/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf b/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf index a6b22de419f..9cd043716ce 100644 --- a/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf +++ b/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf @@ -75,8 +75,8 @@ unsup_data_list=./conf/lists/404-georgian/untranscribed-training.list unsup_nj=32 -lexicon_file= -lexiconFlags="--romanized --oov " +lexicon_file=/export/corpora/LDC/LDC2016S12/IARPA_BABEL_OP3_404/conversational/reference_materials/lexicon.txt +lexiconFlags=" --romanized --oov " diff --git a/egs/babel/s5d/local/make_L_align.sh b/egs/babel/s5d/local/make_L_align.sh index 50e46a00493..41e9ff32958 100755 --- a/egs/babel/s5d/local/make_L_align.sh +++ b/egs/babel/s5d/local/make_L_align.sh @@ -34,18 +34,24 @@ tmpdir=$1 dir=$2 outdir=$3 +for f in $dir/phones/optional_silence.txt $dir/phones.txt $dir/words.txt ; do + [ ! -f $f ] && echo "$0: The file $f must exist!" && exit 1 +done + silphone=`cat $dir/phones/optional_silence.txt` || exit 1; +if [ ! -f $tmpdir/lexicon.txt ] && [ ! -f $tmpdir/lexiconp.txt ] ; then + echo "$0: At least one of the files $tmpdir/lexicon.txt or $tmpdir/lexiconp.txt must exist" >&2 + exit 1 +fi + # Create lexicon with alignment info if [ -f $tmpdir/lexicon.txt ] ; then cat $tmpdir/lexicon.txt | \ awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' -elif [ -f $tmpdir/lexiconp.txt ] ; then +else cat $tmpdir/lexiconp.txt | \ awk '{printf("%s #1 ", $1); for (n=3; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' -else - echo "Neither $tmpdir/lexicon.txt nor $tmpdir/lexiconp.txt does not exist" - exit 1 fi | utils/make_lexicon_fst.pl - 0.5 $silphone | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ diff --git a/egs/babel/s5d/local/syllab/generate_phone_lang.sh b/egs/babel/s5d/local/syllab/generate_phone_lang.sh index fc21a23231b..81d8a0acdc7 100755 --- a/egs/babel/s5d/local/syllab/generate_phone_lang.sh +++ b/egs/babel/s5d/local/syllab/generate_phone_lang.sh @@ -122,8 +122,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." exit 0 diff --git a/egs/babel/s5d/local/syllab/generate_syllable_lang.sh b/egs/babel/s5d/local/syllab/generate_syllable_lang.sh index db7b0902425..a7bd667027c 100755 --- a/egs/babel/s5d/local/syllab/generate_syllable_lang.sh +++ b/egs/babel/s5d/local/syllab/generate_syllable_lang.sh @@ -122,8 +122,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK."
exit 0 diff --git a/egs/bentham/v1/local/create_splits.sh b/egs/bentham/v1/local/create_splits.sh index 93e8bf1b12e..e8ea2279a49 100755 --- a/egs/bentham/v1/local/create_splits.sh +++ b/egs/bentham/v1/local/create_splits.sh @@ -27,10 +27,8 @@ function split { echo $name $lines_dir"/"$name".png" >> $split_dir/images.scp echo $name $spkid >> $split_dir/utt2spk done < "$line_file" - - sed -i '/^\s*$/d' $split_dir/images.scp - sed -i '/^\s*$/d' $split_dir/text - sed -i '/^\s*$/d' $split_dir/utt2spk + + perl -i -ne 'print if /\S/' $split_dir/images.scp $split_dir/text $split_dir/utt2spk utils/utt2spk_to_spk2utt.pl $split_dir/utt2spk > $split_dir/spk2utt } diff --git a/egs/bn_music_speech/v1/local/make_musan.py b/egs/bn_music_speech/v1/local/make_musan.py index 942973cfc65..eb739b68180 100755 --- a/egs/bn_music_speech/v1/local/make_musan.py +++ b/egs/bn_music_speech/v1/local/make_musan.py @@ -45,7 +45,7 @@ def prepare_music(root_dir, use_vocals): else: print("Missing file {}".format(utt)) num_bad_files += 1 - print(("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) + print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_speech(root_dir): @@ -71,7 +71,7 @@ def prepare_speech(root_dir): else: print("Missing file {}".format(utt)) num_bad_files += 1 - print(("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) + print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_noise(root_dir): @@ -97,7 +97,7 @@ def prepare_noise(root_dir): else: print("Missing file {}".format(utt)) num_bad_files += 1 - print(("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) + print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def main(): diff --git a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh index d7591a6a3a8..8d579138c73 100755 --- a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh +++ b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh @@ -102,7 +102,7 @@ if [ $stage -le 0 ]; then fi utils/data/get_uniform_subsegments.py \ --max-segment-duration=$window \ - --overlap-duration=$(echo "$window-$period" | bc) \ + --overlap-duration=$(perl -e "print ($window-$period);") \ --max-remaining-duration=$min_segment \ --constant-duration=True \ $segments > $dir/subsegments diff --git a/egs/callhome_diarization/v1/local/make_musan.py b/egs/callhome_diarization/v1/local/make_musan.py index 974e73e0777..7c50adf7c83 100755 --- a/egs/callhome_diarization/v1/local/make_musan.py +++ b/egs/callhome_diarization/v1/local/make_musan.py @@ -45,7 +45,7 @@ def prepare_music(root_dir, use_vocals): else: print("Missing file: {}".format(utt)) num_bad_files += 1 - print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files) + print("In music directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_speech(root_dir): @@ -71,7 +71,7 @@ def prepare_speech(root_dir): else: print("Missing file: {}".format(utt)) num_bad_files += 
1 - print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files) + print("In speech directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def prepare_noise(root_dir): @@ -97,7 +97,7 @@ def prepare_noise(root_dir): else: print("Missing file: {}".format(utt)) num_bad_files += 1 - print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files) + print("In noise directory, processed {} files: {} had missing wav data".format(num_good_files, num_bad_files)) return utt2spk_str, utt2wav_str def main(): diff --git a/egs/callhome_diarization/v1/run.sh b/egs/callhome_diarization/v1/run.sh index acc48bd24f9..f4652c0c0ef 100755 --- a/egs/callhome_diarization/v1/run.sh +++ b/egs/callhome_diarization/v1/run.sh @@ -188,7 +188,7 @@ if [ $stage -le 6 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ exp/tuning/${dataset}_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index 4f730d4753c..b79717e2348 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -115,7 +115,7 @@ if [ $stage -le 2 ]; then # Make a reverberated version of the SWBD+SRE list. Note that we don't add any # additive noise here. - python steps/data/reverberate_data_dir.py \ + steps/data/reverberate_data_dir.py \ "${rvb_opts[@]}" \ --speech-rvb-probability 1 \ --pointsource-noise-addition-probability 0 \ @@ -140,11 +140,11 @@ if [ $stage -le 2 ]; then done # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise + steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music + steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble + steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble # Combine reverb, noise, music, and babble into one directory. utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble @@ -297,7 +297,7 @@ if [ $stage -le 10 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $nnet_dir/tuning/${dataset}_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 
1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi diff --git a/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh b/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh index 62bca974e53..d9faa97f266 100755 --- a/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh +++ b/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh @@ -54,9 +54,8 @@ cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \ $dir/extra_questions.txt || exit 1; # Add prons for laughter, noise, oov -for w in `grep -v sil $dir/silence_phones.txt`; do -sed -i "/\[$w\]/d" $tmpdir/lexicon.3 -done +w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') +perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.3 for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/callhome_egyptian/s5/local/ctm.sh b/egs/callhome_egyptian/s5/local/ctm.sh index 14056b7a44b..64a7cf0d4f6 100755 --- a/egs/callhome_egyptian/s5/local/ctm.sh +++ b/egs/callhome_egyptian/s5/local/ctm.sh @@ -18,9 +18,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} +#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . /export/babel/data/software/env.sh diff --git a/egs/csj/s5/local/csj_make_trans/csj_autorun.sh b/egs/csj/s5/local/csj_make_trans/csj_autorun.sh index f288e4fb4d3..5cd78ee94ae 100755 --- a/egs/csj/s5/local/csj_make_trans/csj_autorun.sh +++ b/egs/csj/s5/local/csj_make_trans/csj_autorun.sh @@ -61,7 +61,7 @@ if [ ! -e $outd/.done_make_trans ];then mkdir -p $outd/$vol/$id case "$csjv" in - "usb" ) TPATH="$resource/${SDB}$vol" ; WPATH="$resource/$WAV" ;; + "usb" ) TPATH="$resource/${SDB}$vol" ; WPATH="$resource/${WAV}$vol" ;; "dvd" ) TPATH="$resource/$vol/$id" ; WPATH="$resource/$vol/$id" ;; "merl" ) TPATH="$resource/$vol/$SDB" ; WPATH="$resource/$vol/$WAV" ;; esac diff --git a/egs/dihard_2018/v1/run.sh b/egs/dihard_2018/v1/run.sh index 429a1231975..44af9f48c3f 100755 --- a/egs/dihard_2018/v1/run.sh +++ b/egs/dihard_2018/v1/run.sh @@ -186,7 +186,7 @@ if [ $stage -le 7 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $ivec_dir/tuning/dihard_2018_dev_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi diff --git a/egs/dihard_2018/v2/run.sh b/egs/dihard_2018/v2/run.sh index 1c018dfcc55..0da1f330ea7 100755 --- a/egs/dihard_2018/v2/run.sh +++ b/egs/dihard_2018/v2/run.sh @@ -260,7 +260,7 @@ if [ $stage -le 12 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $nnet_dir/tuning/dihard_2018_dev_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 
1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi diff --git a/egs/fisher_callhome_spanish/s5/local/ctm.sh b/egs/fisher_callhome_spanish/s5/local/ctm.sh index 7d09f574580..62860a10b7b 100755 --- a/egs/fisher_callhome_spanish/s5/local/ctm.sh +++ b/egs/fisher_callhome_spanish/s5/local/ctm.sh @@ -19,9 +19,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} +#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . /export/babel/data/software/env.sh diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 7b2de2db392..779298305c4 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -105,9 +105,8 @@ if [ $stage -le 4 ]; then cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2" # Add prons for laughter, noise, oov - for w in `grep -v sil $dir/silence_phones.txt`; do - sed -i "/\[$w\]/d" $tmpdir/lexicon.2 - done + w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') + perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.2 for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 864b76b671b..b42eb52d20a 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python # Copyright 2014 Gaurav Kumar. 
Apache 2.0 +# 2018 Saikiran Valluri, GoVivace inc., Avaaya +#!/usr/bin/env python # -*- coding: utf-8 -*- # # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon - from __future__ import print_function -import sys +import sys, re import json import codecs import operator @@ -17,6 +17,7 @@ uw_gigaword = tmpdir + "/es_wordlist.json" uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" +filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary @@ -55,7 +56,8 @@ ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + if not item==u'ñ' and not re.search(filtered_letters, item): + lf.write(item + "\n") lf.close() diff --git a/egs/gale_arabic/s5/local/gale_prep_dict.sh b/egs/gale_arabic/s5/local/gale_prep_dict.sh index 74ef789eda7..f6fd83378d0 100755 --- a/egs/gale_arabic/s5/local/gale_prep_dict.sh +++ b/egs/gale_arabic/s5/local/gale_prep_dict.sh @@ -25,9 +25,8 @@ echo SIL > $dir/optional_silence.txt cat $dir/lexicon.txt | cut -d ' ' -f2- | tr -s ' ' '\n' |\ sort -u > $dir/nonsilence_phones.txt || exit 1; +perl -i -pe 'print " SIL\n" if $.==1' $dir/lexicon.txt - sed -i '1i SIL' $dir/lexicon.txt - echo Dictionary preparation succeeded exit 0 diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS index 2260a106654..a485240ff6b 100644 --- a/egs/gale_arabic/s5b/RESULTS +++ b/egs/gale_arabic/s5b/RESULTS @@ -2,13 +2,7 @@ # This file is generated using local/split_wer.sh $galeData //galeData is a local folder to keep intermediate gale data # look at the end of run.sh in the same folder ## -##### RESULTS generated by amali at 2017-01-01-08-05-59 - Report Results WER: -%WER 9.50 [ 2124 / 22363, 160 ins, 275 del, 1689 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_report_9 -%WER 10.72 [ 2398 / 22363, 163 ins, 313 del, 1922 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_report_9 -%WER 12.04 [ 2693 / 22363, 226 ins, 271 del, 2196 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_report_9 -%WER 12.29 [ 2749 / 22363, 273 ins, 266 del, 2210 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_report_10 %WER 17.82 [ 3986 / 22363, 315 ins, 618 del, 3053 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_report_12 %WER 18.15 [ 4059 / 22363, 335 ins, 589 del, 3135 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_report_11 %WER 18.42 [ 4119 / 22363, 346 ins, 590 del, 3183 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_report_11 @@ -27,10 +21,6 @@ Report Results WER: %WER 25.66 [ 5738 / 22363, 478 ins, 838 del, 4422 sub ] exp/tri2a/decode/wer_report_14 %WER 26.38 [ 5900 / 22363, 435 ins, 929 del, 4536 sub ] exp/tri1/decode/wer_report_15 Conversational Results WER: -%WER 21.59 [ 10213 / 47305, 944 ins, 3092 del, 6177 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_conversational_9 -%WER 24.77 [ 11716 / 47305, 1098 ins, 3579 del, 7039 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_conversational_9 -%WER 26.78 [ 12670 / 47305, 1741 ins, 2434 del, 8495 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_conversational_9 -%WER 27.55 [ 13032 / 47305, 1800 ins, 2666 del, 8566 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_conversational_11 %WER 34.10 [ 16133 / 47305, 1903 ins, 3245 del, 10985 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_conversational_11 %WER 34.81 [ 16466 / 47305, 2077 ins, 3037 del, 11352 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_conversational_10 %WER 35.19 [ 16648 / 47305, 1933 ins, 3264 del, 11451 sub ] 
exp/sgmm_5a_mmi_b0.1/decode3/wer_conversational_11 @@ -49,10 +39,6 @@ Conversational Results WER: %WER 45.92 [ 21724 / 47305, 1995 ins, 5213 del, 14516 sub ] exp/tri2a/decode/wer_conversational_14 %WER 46.86 [ 22166 / 47305, 2212 ins, 4819 del, 15135 sub ] exp/tri1/decode/wer_conversational_13 Combined Results for Reports and Conversational WER: -%WER 17.64 [ 12286 / 69668, 1310 ins, 2807 del, 8169 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_8 -%WER 20.26 [ 14114 / 69668, 1261 ins, 3892 del, 8961 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_9 -%WER 22.05 [ 15363 / 69668, 1967 ins, 2705 del, 10691 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_9 -%WER 22.66 [ 15786 / 69668, 2047 ins, 2955 del, 10784 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_11 %WER 28.89 [ 20127 / 69668, 2244 ins, 3829 del, 14054 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_11 %WER 29.48 [ 20541 / 69668, 2243 ins, 3860 del, 14438 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_11 %WER 29.81 [ 20767 / 69668, 2279 ins, 3854 del, 14634 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_11 @@ -65,8 +51,30 @@ Combined Results for Reports and Conversational WER: %WER 32.36 [ 22542 / 69668, 2156 ins, 4184 del, 16202 sub ] exp/tri2b_mmi/decode_it4/wer_11 %WER 32.50 [ 22640 / 69668, 2393 ins, 3956 del, 16291 sub ] exp/tri2b_mmi/decode_it3/wer_11 %WER 32.79 [ 22847 / 69668, 2407 ins, 4760 del, 15680 sub ] exp/tri2b_mpe/decode_it3/wer_13 +# WER with train_sat_basis +%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 +# WER with train_sat %WER 33.61 [ 23413 / 69668, 2817 ins, 4577 del, 16019 sub ] exp/tri3b/decode/wer_17 %WER 35.73 [ 24894 / 69668, 2630 ins, 4944 del, 17320 sub ] exp/tri3b/decode.si/wer_15 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16 %WER 39.42 [ 27462 / 69668, 2473 ins, 6051 del, 18938 sub ] exp/tri2a/decode/wer_14 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13 + + +# Effect of GMM seed model (tri2b instead of tri3b). Using tri3b give a slightly better result +# as compared to using tri2b as seed. +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3b_sp/decode_test/wer_10_0.0 +%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_2b_sp/decode_test/wer_9_0.0 + +# Effect of Tree-size (3500, 4500, 7000, 11000) +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0 +%WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0 +%WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0 + +# Effect of l2-regularization on the output with tree-size=7000. 
l2 on the output (0.005,0.002) +%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_005_sp/decode_test/wer_9_0.5 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_002_sp/decode_test/wer_9_0.0 + +#current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh) +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0 diff --git a/egs/gale_arabic/s5b/cmd.sh b/egs/gale_arabic/s5b/cmd.sh index 71dd849a93b..ea341c98d4a 100755 --- a/egs/gale_arabic/s5b/cmd.sh +++ b/egs/gale_arabic/s5b/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="retry.pl queue.pl --mem 4G" +export mkgraph_cmd="retry.pl queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5b/local/chain/compare_wer.sh b/egs/gale_arabic/s5b/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5b/local/chain/run_chain_common.sh b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. 
+# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh index 7afafb31ff6..a3ccfda04ac 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -1,31 +1,51 @@ #!/bin/bash -#started from tedlium recipe with few edits +# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp +# System tdnn_1a_sp +# WER 16.47 +# CER 6.68 +# Final train prob -0.0652 +# Final valid prob -0.0831 +# Final train prob (xent) -0.8965 +# Final valid prob (xent) -0.9964 +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/ +# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083) -set -e -o pipefail -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). +set -e -o pipefail stage=0 nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 train_set=train -gmm=tri2b # the gmm for the target data +test_set=test +gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 #default -10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true # End configuration section. echo "$0 $@" # Print the command line for logging + . ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -39,169 +59,162 @@ where "nvcc" is installed. 
EOF fi -local/nnet3/run_ivector_common.sh --stage $stage \ - --nj $nj \ - --min-seg-len $min_seg_len \ - --train-set $train_set \ - --gmm $gmm \ - --num-threads-ubm $num_threads_ubm \ - --nnet3-affix "$nnet3_affix" - - -gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb - +if $run_ivector_common; then + local/nnet3/run_ivector_common.sh \ + --stage $stage --nj $nj \ + --train-set $train_set --gmm $gmm \ + --num-threads-ubm $num_threads_ubm \ + --nnet3-affix "$nnet3_affix" +fi -for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ - $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats +dir=exp/chain${nnet3_affix}/tdnn${affix}_sp +train_data_dir=data/${train_set}_sp_hires +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp + +# note: you don't necessarily have to change the treedir name +# each time you do a new experiment-- only if you change the +# configuration in a way that affects the tree. +tree_dir=exp/chain${nnet3_affix}/tree_a_sp +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ + $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \ + $ali_dir/ali.1.gz $gmm_dir/final.mdl; do [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 done -if [ $stage -le 14 ]; then - echo "$0: creating lang directory with one state per phone." - # Create a version of the lang/ directory that has one state per phone in the - # topo file. [note, it really has two states.. the first one is only repeated - # once, the second one has zero or more repeats.] - if [ -d data/lang_chain ]; then - if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then - echo "$0: data/lang_chain already exists, not overwriting it; continuing" - else - echo "$0: data/lang_chain already exists and seems to be older than data/lang..." - echo " ... not sure what to do. Exiting." - exit 1; - fi - else - cp -r data/lang data/lang_chain - silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; - nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; - # Use our special topology... note that later on may have to tune this - # topology. - steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo - fi +# Please take this as a reference on how to specify all the options of +# local/chain/run_chain_common.sh +if $run_chain_common; then + local/chain/run_chain_common.sh --stage $stage \ + --gmm-dir $gmm_dir \ + --ali-dir $ali_dir \ + --lores-train-data-dir ${lores_train_data_dir} \ + --lang $lang \ + --lat-dir $lat_dir \ + --num-leaves 7000 \ + --tree-dir $tree_dir || exit 1; fi if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). 
- # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" mkdir -p $dir/configs + cat <<EOF > $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input - # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=450 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers.
- relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - fi -if [ $stage -le 18 ]; then + +if [ $stage -le 16 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/chain/train.py --stage $train_stage \ + steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ + --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 2 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - + --lat-dir=$lat_dir \ + --dir $dir || exit 1; +fi -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; fi -if [ $stage -le 20 ]; then +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/test_hires $dir/decode || exit 1; + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 fi -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh b/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh deleted file mode 100755 index 0125272d06c..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -audio_dvds=${@:1:${#}-1} # all the audio dvds for GALE corpus; ; check audio=( in ../run.sh - -mkdir -p $galeData - -# check that sox is installed -which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed"; exit 1 -fi - -for dvd in $audio_dvds; do - dvd_full_path=$(utils/make_absolute.sh $dvd) - if [[ ! 
-e $dvd_full_path ]]; then - echo missing $dvd_full_path; exit 1; - fi - find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do - id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') - echo "$id sox $file -r 16000 -t wav - |" - done -done | sort -u > $galeData/wav.scp - -echo data prep audio succeded - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh b/egs/gale_arabic/s5b/local/gale_data_prep_split.sh deleted file mode 100755 index b18a4e5b105..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the "; exit 1 -fi - - -#data will data/local - -galeData=$(utils/make_absolute.sh $1) -mkdir -p data/local -dir=$(utils/make_absolute.sh data/local) - - -grep -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.test -grep -v -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.train - -for x in test train; do - outdir=$dir/$x - file=$galeData/all.$x - mkdir -p $outdir - awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk - cp -pr $outdir/utt2spk $outdir/spk2utt - awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments - awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done - - -grep -f local/test_list $galeData/wav.scp > $dir/test/wav.scp - -cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} - {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - -echo data prep split succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh b/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh deleted file mode 100755 index 04529d88ac0..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -txt_dvds=${@:1:${#}-1} # all the txt cds correspoding to the audio corpus; check text=( in ../run.sh - - -top_pwd=`pwd` -txtdir=$galeData/txt -mkdir -p $txtdir; cd $txtdir - -for cdx in $txt_dvds; do - echo "Preparing $cdx" - if [[ $cdx == *.tgz ]] ; then - tar -xvf $cdx - elif [ -d "$cdx" ]; then - ln -s $cdx `basename $cdx` - else - echo "I don't really know what I shall do with $cdx " >&2 - fi -done - -find -L . 
-type f -name "*.tdf" | while read file; do -sed '1,3d' $file # delete the first 3 lines -done > all.tmp$$ - -perl -e ' - ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; - open(IN, "$inFile"); - open(ID, ">$idFile"); - open(TXT, ">$txtFile"); - while () { - @arr= split /\t/,$_; - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} - $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; - print TXT "$arr[7]\n"; - }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" - - -perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ - -paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ - -awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $galeData/all -awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/report -awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/conversational - -cd ..; -rm -fr $txtdir -cd $top_pwd -echo data prep text succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_format_data.sh b/egs/gale_arabic/s5b/local/gale_format_data.sh deleted file mode 100755 index b69c34e68b9..00000000000 --- a/egs/gale_arabic/s5b/local/gale_format_data.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ -f path.sh ]; then - . ./path.sh; else - echo "$0: missing path.sh"; exit 1; -fi - -for dir in test train; do - cp -pr data/local/$dir data/$dir -done - - -mkdir -p data/lang_test - -arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz -[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -r data/lang_test -cp -r data/lang data/lang_test - -gunzip -c "$arpa_lm" | \ - arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst - - -echo "$0: Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst - -## Check lexicon. -## just have a look and make sure it seems sane. -echo "$0: First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - -echo "$0: Performing further checks" - -# Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. - -# Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. - -# Checking that disambiguated lexicon times G is determinizable -# Note: we do this with fstdeterminizestar not fstdeterminize, as -# fstdeterminize was taking forever (presumbaly relates to a bug -# in this version of OpenFst that makes determinization slow for -# some case). -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstdeterminizestar >/dev/null || echo Error - -# Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ - fstisstochastic || echo LG is not stochastic - - -echo gale_format_data succeeded. 
- -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh b/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh deleted file mode 100755 index 5f101f8245b..00000000000 --- a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright 2017 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -# run this from ../ -dir=$(utils/make_absolute.sh data/local/dict) -mkdir -p $dir - - -# (1) Get all avaialble dictionaries, since this is a grapheme model, so we mainly need the most frequent word lists -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1; -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1; -bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$ -bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$ -# (2) Now we add all the words appeared in the training data -cat data/local/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$ -grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla -cat tmp1.$$ | sed 's:\(\):\1 :g' | sed -e 's: : :g' -e 's: : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$ -paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt - -#(2) Dictionary preparation: - -# silence phones, one per line. -echo SIL > $dir/silence_phones.txt -echo SIL > $dir/optional_silence.txt - -# nonsilence phones; on each line is a list of phones that correspond -# really to the same base phone. -cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1; - -sed -i '1i SIL' $dir/lexicon.txt # insert word with phone sil at the begining of the dictionary - -rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$ -echo Dictionary preparation succeeded - -# The script is still missing dates and numbers - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_train_lms.sh b/egs/gale_arabic/s5b/local/gale_train_lms.sh deleted file mode 100755 index 3988ec3818f..00000000000 --- a/egs/gale_arabic/s5b/local/gale_train_lms.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. - - -lexicon=data/local/dict/lexicon.txt -[ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; - - -# This script takes no arguments. It assumes you have already run -# previus steps successfully -# It takes as input the files -#data/local/train.*/text -#data/local/dict/lexicon.txt - - -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - -dir=data/local/lm - mkdir -p $dir - text=data/local/train/text - [ ! 
-f $text ] && echo "$0: No such file $text" && exit 1; - - cleantext=$dir/text.no_oov - - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ - > $cleantext || exit 1; - - - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs - cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. - cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - - train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# LM is small enough that we don't need to prune it (only about 0.7M N-grams). -# Perplexity over 128254.000000 words is 90.446690 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -echo train lm succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh index f14c8441869..f071842dc0b 100755 --- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh @@ -2,31 +2,29 @@ set -e -o pipefail -# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually -# be called by more scripts). It contains the common feature preparation and iVector-related parts -# of the script. See those scripts for examples of usage. +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. stage=0 nj=100 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh train_set=train # you might set this to e.g. train. -gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; +test_sets="test" +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. num_threads_ubm=32 -nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it - # becomes exp/nnet3_cleaned or whatever. +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff . ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! 
-f $f ]; then @@ -61,7 +59,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires done @@ -69,7 +67,7 @@ if [ $stage -le 2 ]; then # features; this helps make trained nnets more invariant to test data volume. utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${datadir}_hires steps/compute_cmvn_stats.sh data/${datadir}_hires @@ -78,75 +76,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l /dev/null +if [[ $? != 0 ]]; then + echo "$0: sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! -e $dvd_full_path ]]; then + echo "$0: missing $dvd_full_path"; exit 1; + fi + find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do + id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') + echo "$id sox $file -r 16000 -t wav - |" + done +done | sort -u > $gale_data/wav.scp +echo "$0:data prep audio succeded" + +gale_data=$(utils/make_absolute.sh "GALE" ); +top_pwd=`pwd` +txtdir=$gale_data/txt +mkdir -p $txtdir; cd $txtdir + +for cdx in $text1 $text2 $text3; do + echo "$0:Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + ln -s $cdx `basename $cdx` + else + echo "$0:I don't really know what I shall do with $cdx " >&2 + fi +done + +find -L . 
-type f -name "*.tdf" | while read file; do +sed '1,3d' $file # delete the first 3 lines +done > all.tmp$$ + +perl -e ' + ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; + open(IN, "$inFile"); + open(ID, ">$idFile"); + open(TXT, ">$txtFile"); + while () { + @arr= split /\t/,$_; + $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; + if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} + $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; + print TXT "$arr[7]\n"; + }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" + +perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ +paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ + + +awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all +awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report +awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational + +cd ..; +rm -fr $txtdir +cd $top_pwd +echo "$0:dat a prep text succeeded" + +mkdir -p data +dir=$(utils/make_absolute.sh data/) +grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test +grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train + +for x in test train; do + outdir=data/$x + file=$gale_data/all.$x + mkdir -p $outdir + awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk + cp -pr $outdir/utt2spk $outdir/spk2utt + awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments + awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text +done + +grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp + +cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} + {if (seen[$1]) { print $0}}' > $dir/train/wav.scp + +echo "$0:data prep split succeeded" +exit 0 diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh new file mode 100755 index 00000000000..47b5869fdf1 --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_dict.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." 
+ wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5b/local/prepare_lexicon.py b/egs/gale_arabic/s5b/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh new file mode 100755 index 00000000000..6fdf35f471a --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# Apache 2.0 + +. ./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=3 + +. utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. 
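# Editorial note: after the LM below is built, its quality can be sanity-checked
# with SRILM's ngram tool, as the commented-out perplexity line further down
# hints. A minimal sketch, assuming data/test/text exists and treating its
# transcripts as a held-out set (the dev.txt name is illustrative only):
dir=data/local/lm
cut -d ' ' -f 2- data/test/text > $dir/dev.txt
ngram -order 3 -unk -lm $dir/lm.gz -ppl $dir/dev.txt   # reports OOV count and perplexity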
+ exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/lm.gz + +#ngram -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5b/local/score.sh b/egs/gale_arabic/s5b/local/score.sh index 83366f7c7fc..1d84815fc69 100755 --- a/egs/gale_arabic/s5b/local/score.sh +++ b/egs/gale_arabic/s5b/local/score.sh @@ -1,60 +1,6 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -[ -f ./path.sh ] && . ./path.sh - -# begin configuration section. -cmd=run.pl -stage=0 -decode_mbr=true -word_ins_penalty=0.0 -min_lmwt=7 -max_lmwt=17 -iter= #some of the scripts from steps/ seem to use it -#end configuration section. - -echo "$0 $#" - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi -data=$1 -lang_or_graph=$2 -dir=$3 - -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; -done - -mkdir -p $dir/scoring/log - -cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab \ - ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; +#!/bin/bash -# Note: the double level of quoting for the sed command -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - cat $dir/scoring/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter new file mode 100755 index 00000000000..cf48b434144 --- /dev/null +++ b/egs/gale_arabic/s5b/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in infile: + words = line.strip().split() + words = [word for word in words if '' not in word] + uttid = words[0] + transcript = ' '.join(words[1:]) + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index c45f5119949..3f12d22495e 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -3,177 +3,121 @@ # Copyright 2014 QCRI (author: Ahmed Ali) # Apache 2.0 -. ./path.sh -. 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. num_jobs=120 num_decode_jobs=40 +decode_gmm=true +stage=0 +overwrite=false -#NB: You can add whatever number of copora you like. The supported extensions -#NB: (formats) are wav and flac. Flac will be converted using sox and in contrast -#NB: with the old approach, the conversion will be on-the-fly and one-time-only -#NB: during the parametrization. - -#NB: Text corpora scpecification. We support either tgz files, which are unpacked -#NB: or just plain (already unpacked) directories. The list of transcript is then -#NB: obtained using find command - -#Make sure you edit this section to reflect whers you keep the LDC data on your cluster - -#This is CLSP configuration. We add the 2014 GALE data. We got around 2 % -#improvement just by including it. The gain might be large if someone would tweak -# the number of leaves and states and so on. - -#audio=( -# /export/corpora/LDC/LDC2013S02/ -# /export/corpora/LDC/LDC2013S07/ -# /export/corpora/LDC/LDC2014S07/ -#) -#text=( -# /export/corpora/LDC/LDC2013T17 -# /export/corpora/LDC/LDC2013T04 -# /export/corpora/LDC/LDC2014T17 -#) - -audio=( - /data/sls/scratch/amali/data/GALE/LDC2013S02 - /data/sls/scratch/amali/data/GALE/LDC2013S07 - /data/sls/scratch/amali/data/GALE/LDC2014S07 -) -text=( - /data/sls/scratch/amali/data/GALE/LDC2013T17.tgz - /data/sls/scratch/amali/data/GALE/LDC2013T04.tgz - /data/sls/scratch/amali/data/GALE/LDC2014T17.tgz -) +dir1=/export/corpora/LDC/LDC2013S02/ +dir2=/export/corpora/LDC/LDC2013S07/ +dir3=/export/corpora/LDC/LDC2014S07/ +text1=/export/corpora/LDC/LDC2013T17/ +text2=/export/corpora/LDC/LDC2013T04/ +text3=/export/corpora/LDC/LDC2014T17/ galeData=GALE -#prepare the data -#split train dev test -#prepare lexicon and LM - -# You can run the script from here automatically, but it is recommended to run the data preparation, -# and features extraction manually and and only once. -# By copying and pasting into your shell. - -#copy the audio files to local folder wav and convet flac files to wav -local/gale_data_prep_audio.sh "${audio[@]}" $galeData || exit 1; +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. -#get the transcription and remove empty prompts and all noise markers -local/gale_data_prep_txt.sh "${text[@]}" $galeData || exit 1; +if [ $stage -le 0 ]; then -# split the data to reports and conversational and for each class will have rain/dev and test -local/gale_data_prep_split.sh $galeData || exit 1; + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi -# get all Arabic grapheme dictionaries and add silence and UNK -local/gale_prep_grapheme_dict.sh || exit 1; + echo "$0: Preparing data..." + local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ + --text1 $text1 --text2 $text2 --text3 $text3 + echo "$0: Preparing lexicon and LM..." 
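# Editorial note: the corpus locations above are ordinary shell variables set
# before utils/parse_options.sh is sourced, so they can be overridden from the
# command line instead of editing run.sh. A hypothetical invocation for a site
# that keeps the LDC releases elsewhere (paths are illustrative only):
#   ./run.sh --stage 0 \
#     --dir1 /my/corpora/LDC2013S02 --dir2 /my/corpora/LDC2013S07 --dir3 /my/corpora/LDC2014S07 \
#     --text1 /my/corpora/LDC2013T17 --text2 /my/corpora/LDC2013T04 --text3 /my/corpora/LDC2014T17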
+ local/prepare_dict.sh -#prepare the langauge resources -utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang -# LM training -local/gale_train_lms.sh || exit 1; + local/prepare_lm.sh -local/gale_format_data.sh || exit 1; -# G compilation, check LG composition + utils/format_lm.sh data/lang data/local/lm/lm.gz \ + data/local/dict/lexicon.txt data/lang_test +fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. mfccdir=mfcc - -for x in train test ; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ - data/$x exp/make_mfcc/$x $mfccdir - utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir -done - - -# Here we start the AM - -# Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - -# Train monophone models on a subset of the data, 10K segment -# Note: the --boost-silence option should probably be omitted by default -steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; - - -# Get alignments from monophone system. -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; - -# train tri1 [first triphone pass] -steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; - -# First triphone decoding -utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri1/graph data/test exp/tri1/decode - -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri1 exp/tri1_ali || exit 1; - -# Train tri2a, which is deltas+delta+deltas -steps/train_deltas.sh --cmd "$train_cmd" \ - 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; - -# tri2a decoding -utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode - -# train and decode tri2b [LDA+MLLT] -steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2b/graph data/test exp/tri2b/decode - -# Align all data with LDA+MLLT system (tri2b) -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - - -# From 2b system, train 3b which is LDA + MLLT + SAT. -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph -steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ - "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode - -# From 3b system, align all data. 
-steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; - - -# nnet3 cross-entropy -local/nnet3/run_tdnn.sh #tdnn recipe: -local/nnet3/run_lstm.sh --stage 12 #lstm recipe (we skip ivector training) - -# chain lattice-free -local/chain/run_tdnn.sh #tdnn recipe: -local/chain/run_tdnn_lstm.sh #tdnn-lstm recipe: - -time=$(date +"%Y-%m-%d-%H-%M-%S") - -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time # to make sure you keep the results timed and owned - -echo training succedded +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." + for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +if [ $stage -le 2 ]; then + echo "$0: creating sub-set and training monophone system" + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Aligning data using monophone system" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + echo "$0: training triphone system with delta features" + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1/graph data/test exp/tri1/decode +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali exp/tri2b || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b/decode +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning data and retraining and realigning with sat_basis" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + + steps/train_sat_basis.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; + + steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; +fi + +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph + steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ + "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode +fi + +if [ $stage -le 9 ]; then + echo "$0: Training a regular chain model using the e2e alignments..." 
+ local/chain/run_tdnn.sh +fi + +echo "$0: training succedded" exit 0 - -#TODO: -#LM (4-gram and RNN) rescoring -#combine lattices -#dialect detection - - - - - diff --git a/egs/heroico/s5/cmd.sh b/egs/heroico/s5/cmd.sh index a427f3c16a5..533aad25db1 100755 --- a/egs/heroico/s5/cmd.sh +++ b/egs/heroico/s5/cmd.sh @@ -10,6 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export cmd="retry.pl queue.pl" export train_cmd="retry.pl queue.pl" export decode_cmd="retry.pl queue.pl --mem 2G" diff --git a/egs/heroico/s5/local/heroico_download.sh b/egs/heroico/s5/local/heroico_download.sh deleted file mode 100755 index 9c58fe37537..00000000000 --- a/egs/heroico/s5/local/heroico_download.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# Copyright 2018 John Morgan -# Apache 2.0. - -speech=$1 -lexicon=$2 - -download_dir=$(pwd) -tmpdir=data/local/tmp -data_dir=$tmpdir/LDC2006S37/data - -mkdir -p $tmpdir - -# download the corpus from openslr - -if [ ! -f $download_dir/heroico.tar.gz ]; then - wget -O $download_dir/heroico.tar.gz $speech - - ( - cd $download_dir - tar -xzf heroico.tar.gz - ) -fi - -mkdir -p data/local/dict $tmpdir/dict - -# download the dictionary from openslr - -if [ ! -f $download_dir/santiago.tar.gz ]; then - wget -O $download_dir/santiago.tar.gz $lexicon -fi - -( - cd $download_dir - tar -xzf santiago.tar.gz -) diff --git a/egs/heroico/s5/local/subs_prepare_data.pl b/egs/heroico/s5/local/subs_prepare_data.pl index a7e0cfb0c6e..e39db79f610 100755 --- a/egs/heroico/s5/local/subs_prepare_data.pl +++ b/egs/heroico/s5/local/subs_prepare_data.pl @@ -19,7 +19,7 @@ # input and output files -my $corpus = "OpenSubtitles2018.en-es.es"; +my $corpus = "OpenSubtitles.en-es.es"; my $symbol_table = "data/lang/words.txt"; my $filtered = "data/local/tmp/subs/lm/es.txt"; my $oovs = "data/local/tmp/subs/lm/oovs.txt"; diff --git a/egs/heroico/s5/run.sh b/egs/heroico/s5/run.sh index 67ad87e55f9..4cc5617e985 100755 --- a/egs/heroico/s5/run.sh +++ b/egs/heroico/s5/run.sh @@ -9,11 +9,11 @@ stage=0 datadir=/export/corpora5/LDC/LDC2006S37 # The corpus and lexicon are on openslr.org -speech="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" -lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" +#speech_url="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" +lexicon_url="http://www.openslr.org/resources/34/santiago.tar.gz" # Location of the Movie subtitles text corpus -subs_src="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" +subtitles_url="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" . utils/parse_options.sh @@ -26,14 +26,22 @@ set -u tmpdir=data/local/tmp if [ $stage -le 0 ]; then - # download the corpus from openslr - local/heroico_download.sh $speech $lexicon + if [ ! -d $datadir ]; then + echo "$0: please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz" + echo " and set $datadir to the directory where it is located." + exit 1 + fi + if [ ! -s santiago.txt ]; then + echo "$0: downloading the lexicon" + wget -c http://www.openslr.org/resources/34/santiago.tar.gz + tar -xvzf santiago.tar.gz + fi # Get data for lm training - local/subs_download.sh $subs_src + local/subs_download.sh $subtitles_url fi if [ $stage -le 1 ]; then - echo "Makin lists for building models." + echo "Making lists for building models." 
local/prepare_data.sh $datadir fi diff --git a/egs/iam/v1/RESULTS b/egs/iam/v1/RESULTS new file mode 100644 index 00000000000..b25cb3cd772 --- /dev/null +++ b/egs/iam/v1/RESULTS @@ -0,0 +1,42 @@ +Run_end2end.sh (WER using lang_test, lang_unk) +flat_start: + • %WER 14.41 [ 2671 / 18542, 262 ins, 561 del, 1848 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0 + • %WER 15.21 [ 2821 / 18542, 375 ins, 500 del, 1946 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0 + +cnn_e2eali_1a: + • %WER 11.94 [ 2214 / 18542, 267 ins, 380 del, 1567 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_1.0 + • %WER 13.30 [ 2467 / 18542, 441 ins, 330 del, 1696 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_0.5 + +cnn_e2eali_1b: + • %WER 11.20 [ 2076 / 18542, 260 ins, 335 del, 1481 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0 + • %WER 12.46 [ 2311 / 18542, 371 ins, 326 del, 1614 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0 + +cnn_e2eali_1c: + • %WER 9.90 [ 1836 / 18542, 257 ins, 227 del, 1352 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_10_1.0 + • %WER 12.10 [ 2243 / 18542, 411 ins, 269 del, 1563 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_12_0.5 + + +Run.sh (WER using lang_test, lang_unk) +cnn_1a: + • %WER 15.18 [ 2815 / 18542, 285 ins, 509 del, 2021 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + • %WER 16.88 [ 3130 / 18542, 444 ins, 611 del, 2075 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + +cnn_chainali_1a: + • %WER 14.09 [ 2612 / 18542, 245 ins, 505 del, 1862 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_13_0.0 + • %WER 15.93 [ 2954 / 18542, 454 ins, 470 del, 2030 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_10_0.0 + +cnn_chainali_1b: + • %WER 13.29 [ 2465 / 18542, 221 ins, 499 del, 1745 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.5 + • %WER 15.09 [ 2798 / 18542, 418 ins, 468 del, 1912 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_10_0.5 + +cnn_chainali_1c: + • %WER 11.59 [ 2149 / 18542, 276 ins, 362 del, 1511 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0 + • %WER 13.75 [ 2550 / 18542, 465 ins, 368 del, 1717 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0 + +cnn_chainali_1d: + • %WER 11.07 [ 2053 / 18542, 261 ins, 311 del, 1481 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0 + • %WER 12.95 [ 2402 / 18542, 436 ins, 313 del, 1653 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0 + +cnn_chainali_1e: + • %WER 10.03 [ 1859 / 18542, 226 ins, 291 del, 1342 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_11_0.5 + %WER 12.15 [ 2253 / 18542, 406 ins, 282 del, 1565 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_10_0.5 diff --git a/egs/iam/v1/local/augment_data.sh b/egs/iam/v1/local/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/iam/v1/local/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh index ad90710b13f..4a2cc29481c 100755 --- a/egs/iam/v1/local/chain/compare_wer.sh +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -34,6 +34,20 @@ for x in $*; do done echo +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. fi diff --git a/egs/iam/v1/local/chain/run_cnn.sh b/egs/iam/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_chainali.sh b/egs/iam/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..41b712609c2 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1d.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali.sh b/egs/iam/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..ad51803ab0e --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1c.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_e2e_cnn.sh b/egs/iam/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/iam/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh similarity index 81% rename from egs/iam/v1/local/chain/run_cnn_1a.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_1a.sh index 41a76920e37..1253bbe5aa3 100755 --- a/egs/iam/v1/local/chain/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh @@ -4,23 +4,23 @@ # 2017 Chun Chieh Chang # 2017 Ashish Arora -# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ -# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) - # local/chain/compare_wer.sh exp/chain/cnn_1a/ -# System cnn_1a -# WER 18.52 -# CER 10.07 -# Final train prob -0.0077 -# Final valid prob -0.0970 -# Final train prob (xent) -0.5484 -# Final valid prob (xent) -0.9643 -# Parameters 4.36M +# System cnn_1a(dict_50k) cnn_1a(dict_50k + unk 
model) +# WER 16.88 15.18 +# CER 8.52 7.58 +# WER val 16.17 13.53 +# CER val 7.15 5.89 +# Final train prob -0.0299 +# Final valid prob -0.0574 +# Final train prob (xent) -0.3912 +# Final valid prob (xent) -0.6439 +# Parameters 4.36M -set -e -o pipefail +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=42 nj=2..4 num-params=4.4M dim=40->368 combine=-0.029->-0.029 (over 2) xent:train/valid[27,41,final]=(-0.522,-0.394,-0.391/-0.695,-0.644,-0.644) logprob:train/valid[27,41,final]=(-0.035,-0.030,-0.030/-0.056,-0.057,-0.057) +set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it @@ -34,28 +34,21 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +decode_val=true +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <368 combine=-0.020->-0.020 (over 2) xent:train/valid[27,41,final]=(-0.534,-0.425,-0.424/-0.659,-0.612,-0.612) logprob:train/valid[27,41,final]=(-0.026,-0.022,-0.022/-0.017,-0.016,-0.016) set -e -o pipefail stage=0 - nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -20,28 +33,18 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_test +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -160,7 +160,6 @@ EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi - if [ $stage -le 5 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -175,9 +174,9 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -187,15 +186,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -211,20 +205,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh similarity index 80% rename from egs/iam/v1/local/chain/run_cnn_chainali_1b.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh index c6876fbafcb..f5dbb93e7b7 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -1,27 +1,26 @@ #!/bin/bash # chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. 
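# Editorial note: the tuning scripts touched by this patch all converge on the
# same validation-aware decode pattern; a condensed sketch of it, assuming
# $dir, $nj, $cmd, $decode_val and $chunk_width are set as in the scripts above:
if $decode_val; then maybe_val=val; else maybe_val= ; fi
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
for decode_set in test $maybe_val; do
  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
    --frames-per-chunk $frames_per_chunk --nj $nj --cmd "$cmd" \
    $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1;
done
local/chain/compare_wer.sh $dir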
- -# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ -# System cnn_1a cnn_chainali_1b -# WER 18.52 14.38 -# CER 10.07 7.14 -# Final train prob -0.0077 -0.0113 -# Final valid prob -0.0970 -0.0400 -# Final train prob (xent) -0.5484 -0.6043 -# Final valid prob (xent) -0.9643 -0.9030 -# Parameters 4.36M 3.96M +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b +# System cnn_chainali_1b(dict_50k) cnn_chainali_1b(dict_50k + unk_model) +# WER 15.09 13.29 +# CER 7.13 6.08 +# WER val 14.80 11.98 +# CER val 6.16 4.87 +# Final train prob -0.0225 +# Final valid prob -0.0132 +# Final train prob (xent) -0.4466 +# Final valid prob (xent) -0.6048 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ -# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) - +# exp/chain/cnn_chainali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.019->-0.019 (over 2) xent:train/valid[27,41,final]=(-0.545,-0.448,-0.447/-0.645,-0.605,-0.605) logprob:train/valid[27,41,final]=(-0.026,-0.023,-0.023/-0.014,-0.013,-0.013) set -e -o pipefail - stage=0 - nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -31,31 +30,20 @@ chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= -# chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -160,7 +145,6 @@ if [ $stage -le 4 ]; then ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - # adding the layers for xent branch # This block prints the configs for a separate output that will be # trained with a cross-entropy objective in the 'chain' mod?els... 
this @@ -191,9 +175,9 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -203,15 +187,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -227,20 +206,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh similarity index 81% rename from egs/iam/v1/local/chain/run_cnn_chainali_1c.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh index 54c52d913de..1dd83c5078f 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh @@ -1,25 +1,25 @@ #!/bin/bash # chainali_1c is as chainali_1b except it uses l2-regularize -# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c -# System cnn_chainali_1b cnn_chainali_1c -# WER 14.38 12.72 -# CER 7.14 5.99 -# Final train prob -0.0113 -0.0291 -# Final valid prob -0.0400 -0.0359 -# Final train prob (xent) -0.6043 -0.9781 -# Final valid prob (xent) -0.9030 -1.1544 -# Parameters 3.96M 3.96M +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1c +# System cnn_chainali_1c (dict_50k) cnn_chainali_1c(dict_50k + unk_model) +# WER 12.95 11.07 +# CER 6.04 4.91 +# WER val 12.75 9.78 +# CER val 5.15 3.74 +# Final train prob -0.0217 +# Final valid prob -0.0060 +# Final train prob (xent) -0.8303 +# Final valid prob (xent) -0.8665 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c -# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) - +# exp/chain/cnn_chainali_1c/: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.018->-0.018 (over 1) xent:train/valid[27,41,final]=(-1.22,-0.847,-0.830/-1.19,-0.880,-0.867) logprob:train/valid[27,41,final]=(-0.045,-0.025,-0.022/-0.026,-0.010,-0.006) set -e -o pipefail - stage=0 - nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -29,30 +29,20 @@ chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= -# chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <376 combine=-0.002->-0.002 (over 1) xent:train/valid[13,20,final]=(-1.66,-1.01,-0.865/-1.72,-1.12,-1.01) logprob:train/valid[13,20,final]=(-0.058,-0.019,-0.004/-0.055,-0.027,-0.013) - +# exp/chain/cnn_chainali_1d/: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.018->-0.018 (over 1) xent:train/valid[27,41,final]=(-1.22,-0.847,-0.830/-1.19,-0.880,-0.867) logprob:train/valid[27,41,final]=(-0.045,-0.025,-0.022/-0.026,-0.010,-0.006) set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1c_uc #affix for TDNN+LSTM directory e.g. 
"1a" or "1b", in case we change the configuration. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn_1a_uc +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +decode_val=true +if $decode_val; then maybe_val=val; else maybe_val= ; fi + # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -157,7 +147,6 @@ if [ $stage -le 4 ]; then relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts @@ -192,11 +181,11 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ - --trainer.srand=$srand \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -206,15 +195,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -230,20 +214,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh similarity index 80% rename from egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index ba28f681708..f95f6a90ca1 100755 --- a/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -1,27 +1,26 @@ #!/bin/bash -# e2eali_1a is the same as chainali_1c but uses the e2e chain model to get the -# lattice alignments and to build a tree - -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_chainali_1c exp/chain/cnn_e2eali_1a -# System e2e_cnn_1a cnn_chainali_1c cnn_e2eali_1a -# WER 13.87 12.72 12.70 -# CER 6.54 5.99 5.75 -# Final train prob -0.0371 -0.0291 -0.0557 -# Final valid prob -0.0636 -0.0359 -0.0770 -# Final train prob (xent) -0.9781 -0.8847 -# Final valid prob (xent) -1.1544 -1.0370 -# Parameters 9.13M 3.96M 3.95M +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +# System cnn_e2eali_1a_(dict_50k) cnn_e2eali_1a_(dict_50k + unk model) +# WER 13.30 11.94 +# CER 5.95 5.15 +# WER val 12.85 10.71 +# CER val 5.09 4.03 +# Final train prob -0.0562 +# Final valid prob -0.0634 +# Final train prob (xent) -0.8196 +# Final valid prob (xent) -0.8816 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a -# exp/chain/cnn_e2eali_1a: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.056->-0.056 (over 1) xent:train/valid[13,20,final]=(-1.47,-0.978,-0.918/-1.54,-1.10,-1.06) logprob:train/valid[13,20,final]=(-0.106,-0.065,-0.056/-0.113,-0.086,-0.079) +# exp/chain/cnn_e2eali_1a: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.058->-0.058 (over 1) xent:train/valid[27,41,final]=(-2.67,-0.841,-0.820/-2.71,-0.892,-0.882) logprob:train/valid[27,41,final]=(-0.240,-0.060,-0.056/-0.245,-0.068,-0.063) set -e -o pipefail stage=0 - nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -32,26 +31,19 @@ reporting_email= train_stage=-10 xent_regularize=0.1 frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 remove_egs=true -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . 
./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) +# exp/chain/cnn_e2eali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.039->-0.039 (over 2) xent:train/valid[27,41,final]=(-1.19,-0.805,-0.786/-1.19,-0.846,-0.829) logprob:train/valid[27,41,final]=(-0.060,-0.041,-0.038/-0.062,-0.048,-0.044) set -e -o pipefail - stage=0 - nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -30,27 +30,17 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=true -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - - if ! cuda-compiled; then cat <$lat_dir/splice_opts fi @@ -116,20 +105,17 @@ if [ $stage -le 3 ]; then echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." exit 1; fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ + --frame-subsampling-factor 4 \ --alignment-subsampling-factor 1 \ --context-opts "--context-width=2 --central-position=1" \ - --cmd "$cmd" $num_leaves ${train_data_dir} \ + --cmd "$cmd" $num_leaves $train_data_dir \ $lang $ali_dir $tree_dir fi - if [ $stage -le 4 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) cnn_opts="l2-regularize=0.075" @@ -141,7 +127,6 @@ if [ $stage -le 4 ]; then mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -152,11 +137,9 @@ if [ $stage -le 4 ]; then relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts - # adding the layers for xent branch # This block prints the configs for a separate output that will be # trained with a cross-entropy objective in the 'chain' mod?els... this @@ -172,7 +155,6 @@ EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi - if [ $stage -le 5 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -187,11 +169,11 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ - --trainer.srand=$srand \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -201,15 +183,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=true \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -225,20 +202,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..047d673db17 --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has more CNN layers, different filter size +# smaller lm-opts, minibatch, frams-per-iter, less epochs and more initial/finaljobs. 
+# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1c (dict_50k) cnn_e2eali_1c(dict_50k + unk_model) +# WER 12.10 9.90 +# CER 5.23 4.16 +# WER val 12.15 9.60 +# CER val 4.78 3.56 +# Final train prob -0.0470 +# Final valid prob -0.0657 +# Final train prob (xent) -0.4713 +# Final valid prob (xent) -0.5437 +# Parameters 4.32M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=30 nj=3..5 num-params=4.3M dim=40->368 combine=-0.051->-0.051 (over 1) xent:train/valid[19,29,final]=(-0.722,-0.500,-0.471/-0.748,-0.568,-0.544) logprob:train/valid[19,29,final]=(-0.090,-0.053,-0.047/-0.106,-0.071,-0.066) +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=550 +lang_decode=data/lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + $train_data_dir data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
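The dropout_schedule='0,0@0.20,0.2@0.50,0' introduced above is a piecewise-linear schedule over the fraction of training data seen. The sketch below shows how I read that string (first point at fraction 0.0, last at 1.0, linear interpolation in between); it is a minimal illustration under that assumption, the authoritative parsing lives in Kaldi's nnet3 training scripts, not here.

# Minimal sketch, assuming Kaldi's usual "value[@data-fraction]" convention.
def dropout_at(schedule, data_fraction):
    """Interpolate a dropout schedule string at a given data fraction."""
    parts = schedule.split(',')
    points = []
    for i, part in enumerate(parts):
        if '@' in part:
            value, frac = part.split('@')
            points.append((float(frac), float(value)))
        elif i == 0:
            points.append((0.0, float(part)))   # bare first value sits at fraction 0.0
        else:
            points.append((1.0, float(part)))   # bare last value sits at fraction 1.0
    for (x0, y0), (x1, y1) in zip(points, points[1:]):
        if x0 <= data_fraction <= x1:
            return y0 + (y1 - y0) * (data_fraction - x0) / max(x1 - x0, 1e-9)
    return points[-1][1]

# dropout_at('0,0@0.20,0.2@0.50,0', 0.35) -> 0.1 (halfway up the ramp to 0.2 at 50%)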
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=true \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh similarity index 66% rename from egs/iam/v1/local/chain/run_flatstart_cnn1a.sh rename to egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 56c897137f4..462ad0522de 100755 --- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -2,41 +2,36 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. 
from scratch) - -# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a -# System cnn_1a cnn_chainali_1c e2e_cnn_1a -# WER 18.52 12.72 13.87 -# CER 10.07 5.99 6.54 -# Final train prob -0.0077 -0.0291 -0.0371 -# Final valid prob -0.0970 -0.0359 -0.0636 -# Final train prob (xent) -0.5484 -0.9781 -# Final valid prob (xent) -0.9643 -1.1544 -# Parameters 4.36M 3.96M 9.13M +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a (dict_50k) e2e_cnn_1a (dict_50k + unk_model) +# WER 15.21 14.41 +# CER 7.43 6.82 +# WER val 14.84 13.51 +# CER val 6.41 5.60 +# Final train prob -0.0206 +# Final valid prob -0.0393 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.52M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=-0.020->-0.020 (over 1) logprob:train/valid[27,41,final]=(-0.025,-0.021,-0.021/-0.044,-0.040,-0.039) set -e - -# configs for 'chain' stage=0 train_stage=-10 get_egs_stage=-10 affix=1a +nj=30 # training options tdnn_dim=450 -num_epochs=4 -num_jobs_initial=2 -num_jobs_final=4 minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=1000000 -cmvn_opts="--norm-means=true --norm-vars=true" train_set=train -lang_test=lang_unk - +decode_val=true +lang_decode=data/lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -85,45 +80,34 @@ fi if [ $stage -le 2 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - - cnn_opts="l2-regularize=0.075" - tdnn_opts="l2-regularize=0.075" - output_opts="l2-regularize=0.1" - common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 
height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs fi if [ $stage -le 3 ]; then # no need to store the egs in a shared storage because we always # remove them. Anyway, it takes only 5 minutes to generate them. - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ --cmd "$cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ @@ -131,17 +115,17 @@ if [ $stage -le 3 ]; then --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ --trainer.max-param-change 2.0 \ --cleanup.remove-egs true \ - --feat-dir data/${train_set} \ + --feat-dir data/$train_set \ --tree-dir $treedir \ --dir $dir || exit 1; fi @@ -153,18 +137,18 @@ if [ $stage -le 4 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi -echo "Done. Date: $(date). Results:" +echo "$0 Done. Date: $(date). Results:" local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/extract_features.sh b/egs/iam/v1/local/extract_features.sh new file mode 100755 index 00000000000..1741ad3f9b2 --- /dev/null +++ b/egs/iam/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment=false +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/iam/v1/local/gen_topo.py b/egs/iam/v1/local/gen_topo.py new file mode 100755 index 00000000000..6fae276d542 --- /dev/null +++ b/egs/iam/v1/local/gen_topo.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split('_')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_punctuation_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py index 84e012daedb..3ce501732cf 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2017 Yiwen Shao # 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to @@ -14,20 +15,27 @@ to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. 
local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys +import scipy.io as sio import numpy as np from scipy import misc +from scipy.ndimage.interpolation import affine_transform +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") -parser.add_argument('dir', type=str, - help='Source data directory (containing images.scp)') +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') parser.add_argument('--out-ark', type=str, default='-', help='Where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, @@ -35,8 +43,10 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() @@ -56,18 +66,12 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im, allowed_lengths = None): - scale_size = args.feat_dim - sx = im.shape[1] - sy = im.shape[0] - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) + +def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding else: # Find an allowed length for the image - imlen = im.shape[1] + imlen = im.shape[1] # width allowed_len = 0 for l in allowed_lengths: if l > imlen: @@ -77,28 +81,153 @@ def get_scaled_image(im, allowed_lengths = None): # No allowed length was found for the image (the image is too long) return None padding = allowed_len - imlen - left_padding = padding // 2 + left_padding = int(padding // 2) right_padding = padding - left_padding - dim_y = im.shape[0] + dim_y = im.shape[0] # height im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 -### main ### -data_list_path = os.path.join(args.dir, 'images.scp') +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) + if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im + +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = 
sorted_im[low_index] + high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +### main ### +random.seed(1) +data_list_path = args.images_scp_path if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark,'w') allowed_lengths = None -if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): print("Found 'allowed_lengths.txt' file...", file=sys.stderr) allowed_lengths = [] - with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + with open(allowed_len_handle) as f: for line in f: allowed_lengths.append(int(line.strip())) print("Read {} allowed lengths and will apply them to the " @@ -106,6 +235,7 @@ def 
get_scaled_image(im, allowed_lengths = None): num_fail = 0 num_ok = 0 +aug_setting = ['normal', 'scaled'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -113,15 +243,24 @@ def get_scaled_image(im, allowed_lengths = None): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scaled = get_scaled_image(im, allowed_lengths) - - if im_scaled is None: + if args.fliplr: + im = np.fliplr(im) + if args.augment: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_contrast = contrast_normalization(im_aug, 0.05, 0.2) + slant_degree = find_slant_project(im_contrast) + im_sheared = horizontal_shear(im_contrast, slant_degree) + im_aug = im_sheared + else: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) + if im_horizontal_padded is None: num_fail += 1 continue - data = np.transpose(im_scaled, (1, 0)) + data = np.transpose(im_horizontal_padded, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) -print('Generated features for {} images. Failed for {} (iamge too ' +print('Generated features for {} images. Failed for {} (image too ' 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh index 73d711c73f0..dc07f07e318 100755 --- a/egs/iam/v1/local/prepare_data.sh +++ b/egs/iam/v1/local/prepare_data.sh @@ -18,6 +18,7 @@ stage=0 download_dir=data/download +process_aachen_split=false wellington_dir= username= password= # username and password for downloading the IAM database @@ -53,6 +54,8 @@ ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +aachen_split_url=http://www.openslr.org/resources/56/splits.zip +aachen_splits=data/local/aachensplits mkdir -p $download_dir data/local # download and extact images and transcription @@ -144,6 +147,18 @@ else echo "$0: Wellington Corpus not included because wellington_dir not provided" fi +if [ -d $aachen_splits ]; then + echo "$0: Not downloading the Aachen splits as it is already there." +else + if [ ! -f $aachen_splits/splits.zip ]; then + echo "$0: Downloading Aachen splits ..." 
+ mkdir -p $aachen_splits + wget -P $aachen_splits/ $aachen_split_url || exit 1; + fi + unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1; + echo "$0: Done downloading and extracting Aachen splits" +fi + mkdir -p data/{train,test,val} file_name=largeWriterIndependentTextLineRecognitionTask @@ -160,11 +175,17 @@ cat $train_old > $train_new cat $test_old > $test_new cat $val1_old $val2_old > $val_new -if [ $stage -le 0 ]; then - local/process_data.py data/local data/train --dataset train || exit 1 - local/process_data.py data/local data/test --dataset test || exit 1 - local/process_data.py data/local data/val --dataset validation || exit 1 - - utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt - utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +if $process_aachen_split; then + local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1 +else + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 fi + +image/fix_data_dir.sh data/train +image/fix_data_dir.sh data/test +image/fix_data_dir.sh data/val + diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index f691d577fba..7451f6b85f7 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -38,7 +38,7 @@ while(<>){ }' | sort -u > $dir/lexicon.txt -sed -i "s/#//" $dir/nonsilence_phones.txt +perl -i -pe "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iam/v1/local/process_aachen_splits.py b/egs/iam/v1/local/process_aachen_splits.py new file mode 100755 index 00000000000..cb6a6d4f0d8 --- /dev/null +++ b/egs/iam/v1/local/process_aachen_splits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_aachen_splits.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. 
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('split_path', type=str, + help='location of the train/test/val set') +parser.add_argument('out_dir', type=str, + help='location to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.split_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + + +### main ### + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder) + for file in os.listdir(lines_path): + if file.endswith(".png"): + image_file_path = os.path.join(lines_path, file) + base_name = os.path.splitext(os.path.basename(image_file_path))[0] + text = text_dict[base_name] + utt_id = writer_id + '_' + base_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index a15fbea2af3..3e8c838efdb 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -58,9 +58,12 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true # Using LOB and brown corpus. - cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ - local/remove_test_utterances_from_lob.py data/test/text data/val/text \ - > ${dir}/data/text/lob.txt + if [ ! 
-f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text data/val/text \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt > ${dir}/data/text/lob.txt cat data/local/browncorpus/brown.txt > ${dir}/data/text/brown.txt if [ -d "data/local/wellingtoncorpus" ]; then cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt > ${dir}/data/text/wellington.txt diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index b943870f530..85811b6cb3d 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -20,6 +20,9 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # This corpus is of written NZ English that can be purchased here: # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ +train_set=train_aug +process_aachen_split=false +overwrite=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -30,39 +33,63 @@ wellington_database=/export/corpora5/Wellington/WWC/ ./local/check_tools.sh if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - echo "$0: Preparing the test and train feature files..." - for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done + utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$0: $(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." 
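On the "allowed lengths spaced by 10% difference in length" comment earlier in this stage: the real ladder is produced by image/get_allowed_lengths.py; the sketch below only illustrates the idea, and the rounding to multiples of 4 is an assumption tied to --frame-subsampling-factor 4.

# Minimal sketch of a ~10%-spaced ladder of allowed lengths (illustration only).
def allowed_lengths_sketch(min_len, max_len, spacing_pct=10, subsampling=4):
    lengths = []
    cur = float(min_len)
    while cur <= max_len:
        rounded = int(round(cur / subsampling)) * subsampling  # keep divisible by 4 (assumed)
        if not lengths or rounded > lengths[-1]:
            lengths.append(rounded)
        cur *= 1.0 + spacing_pct / 100.0
    return lengths

# allowed_lengths_sketch(100, 500) -> [100, 112, 120, 132, 148, 160, 176, ...]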
# We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist local/train_lm.sh --vocab-size 50k fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." - # This is for training. Use a large vocab size, e.g. 500k to include all the # training words: local/prepare_dict.sh --vocab-size 500k --dir data/local/dict # this is for training utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang - + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo # This is for decoding. We use a 50k lexicon to be consistent with the papers # reporting WERs on IAM: local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k # this is for decoding @@ -77,11 +104,14 @@ if [ $stage -le 3 ]; then utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict_50k "" data/lang_unk/temp data/lang_unk + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo cp data/lang_test/G.fst data/lang_unk/G.fst fi if [ $stage -le 4 ]; then - steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/$train_set \ data/lang exp/mono fi @@ -93,10 +123,10 @@ if [ $stage -le 5 ] && $decode_gmm; then fi if [ $stage -le 6 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/$train_set data/lang \ exp/mono exp/mono_ali - steps/train_deltas.sh --cmd $cmd 500 20000 data/train data/lang \ + steps/train_deltas.sh --cmd $cmd 500 20000 data/$train_set data/lang \ exp/mono_ali exp/tri fi @@ -108,12 +138,12 @@ if [ $stage -le 7 ] && $decode_gmm; then fi if [ $stage -le 8 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/$train_set data/lang \ exp/tri exp/tri_ali steps/train_lda_mllt.sh --cmd $cmd \ --splice-opts "--left-context=3 --right-context=3" 500 20000 \ - data/train data/lang exp/tri_ali exp/tri2 + data/$train_set data/lang exp/tri_ali exp/tri2 fi if [ $stage -le 9 ] && $decode_gmm; then @@ -125,10 +155,10 @@ fi if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri2 exp/tri2_ali + data/$train_set data/lang exp/tri2 exp/tri2_ali steps/train_sat.sh --cmd $cmd 500 20000 \ - data/train data/lang exp/tri2_ali exp/tri3 + data/$train_set data/lang exp/tri2_ali exp/tri3 fi if [ $stage -le 11 ] && $decode_gmm; then @@ -140,13 +170,13 @@ fi if [ $stage -le 12 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri3 exp/tri3_ali + data/$train_set data/lang exp/tri3 exp/tri3_ali fi if [ $stage -le 13 ]; then - local/chain/run_cnn_1a.sh --lang-test lang_unk + local/chain/run_cnn.sh --lang-test lang_unk --train_set $train_set fi if [ $stage -le 14 ]; then - local/chain/run_cnn_chainali_1c.sh --chain-model-dir exp/chain/cnn_1a --stage 2 + local/chain/run_cnn_chainali.sh --chain-model-dir exp/chain/cnn_1a --stage 2 --train_set $train_set fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 
6df93e739f4..0a8b014715f 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -6,6 +6,8 @@ stage=0 nj=20 username= password= +process_aachen_split=false +overwrite=false # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -16,61 +18,78 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # This corpus is of written NZ English that can be purchased here: # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ - +train_set=train_aug . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. - - ./local/check_tools.sh - if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - image/get_image2num_frames.py data/train # This will be needed for the next command + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - echo "$0: Preparing the test and train feature files..." - for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$0: $(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." # We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist local/train_lm.sh --vocab-size 50k fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." - # This is for training. Use a large vocab size, e.g. 
500k to include all the # training words: local/prepare_dict.sh --vocab-size 500k --dir data/local/dict - utils/prepare_lang.sh --sil-prob 0.95 \ + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo # This is for decoding. We use a 50k lexicon to be consistent with the papers # reporting WERs on IAM. local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k - utils/prepare_lang.sh --sil-prob 0.95 data/local/dict_50k \ - "" data/lang_test/temp data/lang_test + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict_50k "" data/lang_test/temp data/lang_test utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \ data/local/dict_50k/lexicon.txt data/lang_test @@ -79,23 +98,27 @@ if [ $stage -le 3 ]; then data/local/dict_50k exp/unk_lang_model utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict_50k "" data/lang_unk/temp data/lang_unk + + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo cp data/lang_test/G.fst data/lang_unk/G.fst fi -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then echo "$0: Calling the flat-start chain recipe..." - local/chain/run_flatstart_cnn1a.sh + local/chain/run_e2e_cnn.sh --train_set $train_set fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train + data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - local/chain/run_cnn_e2eali_1a.sh + local/chain/run_cnn_e2eali.sh --train_set $train_set fi diff --git a/egs/iam/v2/local/prepare_dict.sh b/egs/iam/v2/local/prepare_dict.sh index e21a59c7e92..714b5b51788 100755 --- a/egs/iam/v2/local/prepare_dict.sh +++ b/egs/iam/v2/local/prepare_dict.sh @@ -39,7 +39,7 @@ while(<>){ }' | sort -u > $dir/lexicon.txt -sed -i "s/#//" $dir/nonsilence_phones.txt +perl -i -pe "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh index eb140e900e1..d449805be1d 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh @@ -24,22 +24,17 @@ xent_regularize=0.1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. tdnn_dim=450 -# training options -srand=0 remove_egs=false lang_decode=data/lang lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! 
cuda-compiled; then cat <330 combine=-0.073->-0.073 (over 2) logprob:train/valid[64,97,final]=(-0.084,-0.080,-0.081/-0.073,-0.070,-0.071) - +# exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.071->-0.070 (over 5) logprob:train/valid[64,97,final]=(-0.089,-0.084,-0.093/-0.075,-0.073,-0.075) set -e # configs for 'chain' stage=0 -nj=70 train_stage=-10 get_egs_stage=-10 affix=1a @@ -31,9 +30,6 @@ minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 common_egs_dir= cmvn_opts="--norm-means=false --norm-vars=false" train_set=train -lang_decode=data/lang -lang_rescore=data/lang_rescore_6g - # End configuration section. echo "$0 $@" # Print the command line for logging @@ -67,7 +63,7 @@ if [ $stage -le 0 ]; then fi if [ $stage -le 1 ]; then - steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ --shared-phones true \ --type mono \ data/$train_set $lang $treedir @@ -107,9 +103,6 @@ EOF fi if [ $stage -le 3 ]; then - # no need to store the egs in a shared storage because we always - # remove them. Anyway, it takes only 5 minutes to generate them. - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ --cmd "$cmd" \ --feat.cmvn-opts "$cmvn_opts" \ @@ -138,29 +131,3 @@ if [ $stage -le 3 ]; then --tree-dir $treedir \ --dir $dir || exit 1; fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -echo "Done. Date: $(date). Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index 71f7f39d632..a39bcfa87d3 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -210,7 +210,7 @@ def get_line_image_location(): image_file_path = os.path.join(location, updated_base_name) line = text_line_word_dict[line_id] text = ' '.join(line) - utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_line_image_file_name, str(line_id).zfill(4)) + utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_name, str(line_id).zfill(4)) text_fh.write(utt_id + ' ' + text + '\n') utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index b7fc0b09a46..903b288a834 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -64,6 +64,12 @@ if [ $stage -le 0 ]; then # we can later fold the dev data into this. 
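To make the process_data.py utt_id fix above concrete (base_name instead of base_line_image_file_name), here is the format string evaluated with purely hypothetical values; only the zero-padding and field order come from the recipe.

# Hypothetical values, just to show the utt_id layout after the fix.
writer_id, image_num, base_name, line_id = "w123", 7, "XYZ_20061006_0005-01", 3
utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_name, str(line_id).zfill(4))
# -> "w123_000007_XYZ_20061006_0005-01_0003"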
cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + if [ -d "data/local/gigawordcorpus/arb_gw_5/data" ]; then + cat data/local/gigawordcorpus/arb_gw_5/data/nhr_arb_combined.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/corpus_text.txt + fi + # for reporting perplexities, we'll use the "real" dev set. # (the validation data is used as ${dir}/data/text/dev.txt to work # out interpolation weights.) @@ -72,7 +78,7 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from MADCAT text - cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi @@ -83,7 +89,7 @@ if [ $stage -le 1 ]; then # Note: if you have more than one order, use a certain amount of words as the # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - min_counts='train=1' + min_counts='corpus_text=2 train=1' wordlist=${dir}/data/wordlist lm_name="`basename ${wordlist}`_${order}" @@ -103,8 +109,8 @@ fi if [ $stage -le 2 ]; then echo "$0: pruning the LM (to larger size)" - # Using 1 million n-grams for a big LM for rescoring purposes. - size=1000000 + # Using 20 million n-grams for a big LM for rescoring purposes. + size=20000000 prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' @@ -114,9 +120,9 @@ fi if [ $stage -le 3 ]; then echo "$0: pruning the LM (to smaller size)" - # Using 500k n-grams for a smaller LM for graph building. Prune from the + # Using 10 million n-grams for a smaller LM for graph building. Prune from the # bigger-pruned LM, it'll be faster. - size=500000 + size=10000000 prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index d3937582662..01bfdbed543 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -19,15 +19,17 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits +images_scp_dir=data/local overwrite=false +subset=false +augment=false +use_extra_corpus_text=true . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. 
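# A minimal, self-contained illustration (the example string is invented, not
# taken from the corpus) of what the sed 's/@@//g' step in the corpus-text
# pipeline above does: utils/lang/bpe/apply_bpe.py marks word-internal split
# points with "@@", and the sed call strips those markers so only the bare
# subword tokens end up in the LM training text.
echo 'segm@@ ent@@ ation of hand@@ writing' | sed 's/@@//g'
# -> segm ent ation of hand writing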
- ./local/check_tools.sh - mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} @@ -37,10 +39,9 @@ if [ $stage -le 0 ]; then echo "Exiting with status 1 to avoid data corruption" exit 1; fi - - echo "$0: Downloading data splits...$(date)" - local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ - --download_dir2 $download_dir2 --download_dir3 $download_dir3 + local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 \ + --use_extra_corpus_text $use_extra_corpus_text for set in test train dev; do data_split_file=$data_splits_dir/madcat.$set.raw.lineid @@ -51,11 +52,11 @@ if [ $stage -le 0 ]; then --data data/local/$set --subset $subset --augment $augment || exit 1 done - echo "$0: Preparing data..." + echo "$0: Processing data..." for set in dev train test; do local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset image/fix_data_dir.sh data/${set} done fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index bb2b4f86db1..62f4eeb7c71 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -36,6 +36,8 @@ if [ $stage -le 0 ]; then echo "Exiting with status 1 to avoid data corruption" exit 1; fi + + echo "$0: preparing data...$(date)" local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 \ --use_extra_corpus_text $use_extra_corpus_text @@ -64,7 +66,7 @@ if [ $stage -le 1 ]; then image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in test train; do + for set in test dev train; do echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set steps/compute_cmvn_stats.sh data/$set || exit 1; @@ -99,28 +101,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." + local/chain/run_e2e_cnn.sh +fi + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +decode_e2e=true +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ - data/local/dict/lexicon.txt data/lang + data/local/dict/lexicon.txt $lang_decode utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/lang data/lang_rescore_6g + data/lang $lang_rescore fi -if [ $stage -le 4 ]; then - echo "$0: Calling the flat-start chain recipe... $(date)." - local/chain/run_e2e_cnn.sh --nj $nj -fi +if [ $stage -le 5 ] && $decode_e2e; then + echo "$0: $(date) stage 5: decoding end2end setup..." + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; -if [ $stage -le 5 ]; then - echo "$0: Aligning the training data using the e2e chain model...$(date)." 
- steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --use-gpu false \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train -fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 -if [ $stage -le 6 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" - local/chain/run_cnn_e2eali.sh --nj $nj + echo "$0: Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ fi diff --git a/egs/madcat_zh/v1/local/chain/run_cnn.sh b/egs/madcat_zh/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh b/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..86568421fe1 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1b.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh b/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh similarity index 88% rename from egs/madcat_zh/v1/local/chain/run_cnn_1a.sh rename to egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh index 43d083099a3..d17b3e3c9c5 100755 --- a/egs/madcat_zh/v1/local/chain/run_cnn_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh @@ -20,7 +20,7 @@ set -e -o pipefail stage=0 nj=50 -train_set=train_60 +train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -32,28 +32,16 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_test # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! 
cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -181,12 +169,12 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=8 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -195,13 +183,9 @@ if [ $stage -le 5 ]; then --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -217,20 +201,15 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/lang_test \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ - $dir/graph data/test_60 $dir/decode_test || exit 1; + $dir/graph data/test $dir/decode_test || exit 1; fi diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_chainali_1b.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh similarity index 88% rename from egs/madcat_zh/v1/local/chain/run_cnn_chainali_1b.sh rename to egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh index bdad37f0c1d..5a3b85422f6 100755 --- a/egs/madcat_zh/v1/local/chain/run_cnn_chainali_1b.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -17,9 +17,8 @@ set -e -o pipefail stage=0 - nj=30 -train_set=train_60 +train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
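# A rough, back-of-the-envelope sketch (the frame count is an assumed
# placeholder, not a measurement) of how the num-epochs / frames-per-iter
# change in run_cnn_1a.sh above reshapes the training schedule: halving the
# epochs halves the total frames processed, and doubling frames-per-iter
# additionally halves the iterations needed per frame, so the number of
# iteration-sized chunks drops to roughly a quarter.
total_frames=400000000   # assumed training-set size, in frames
for cfg in "4 1000000" "2 2000000"; do
  set -- $cfg; epochs=$1; fpi=$2
  echo "num-epochs=$epochs frames-per-iter=$fpi -> ~$(( total_frames * epochs / fpi )) iteration-sized chunks"
done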
@@ -28,32 +27,20 @@ ali=tri3_ali chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} common_egs_dir= reporting_email= - # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_test # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <5760 combine=-0.048->-0.048 (over 5) logprob:train/valid[41,62,final]=(-0.062,-0.065,-0.054/-0.058,-0.062,-0.049) +set -e # configs for 'chain' stage=0 train_stage=-10 @@ -19,16 +21,9 @@ affix=1a # training options tdnn_dim=450 -num_epochs=4 -num_jobs_initial=3 -num_jobs_final=12 minibatch_size=150=48,24/300=24,12/600=12,6/1200=4,4 common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=1000000 -cmvn_opts="--norm-means=true --norm-vars=true" train_set=train -lang_test=lang_test # End configuration section. echo "$0 $@" # Print the command line for logging @@ -63,7 +58,7 @@ if [ $stage -le 0 ]; then fi if [ $stage -le 1 ]; then - steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 70 --cmd "$cmd" \ --shared-phones true \ --type mono \ data/$train_set $lang $treedir @@ -78,17 +73,12 @@ fi if [ $stage -le 2 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - - opts="l2-regularize=0.075" - opts_2="l2-regularize=0.075" - opts_3="l2-regularize=0.1" - common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=80 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=80 height-out=80 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=80 height-out=40 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=40 height-out=40 time-offsets=-4,-2,0,2,4 $common2 @@ -97,13 +87,12 @@ if [ $stage -le 2 ]; then conv-relu-batchnorm-layer name=cnn6 height-in=20 height-out=20 time-offsets=-1,0,1 $common3 conv-relu-batchnorm-layer name=cnn7 height-in=20 height-out=20 time-offsets=-1,0,1 $common3 conv-relu-batchnorm-layer name=cnn8 height-in=20 height-out=10 time-offsets=-1,0,1 $common3 height-subsample-out=2 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 - + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim 
target-rms=0.5 $opts_2 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs @@ -115,9 +104,9 @@ if [ $stage -le 3 ]; then steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ --cmd "$cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ @@ -125,11 +114,11 @@ if [ $stage -le 3 ]; then --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ @@ -139,26 +128,3 @@ if [ $stage -le 3 ]; then --tree-dir $treedir \ --dir $dir || exit 1; fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; -fi - -echo "Done. Date: $(date). Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/madcat_zh/v1/local/download_data.sh b/egs/madcat_zh/v1/local/download_data.sh deleted file mode 100755 index 6b4055f7205..00000000000 --- a/egs/madcat_zh/v1/local/download_data.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Ashish Arora -# Apache 2.0 - -# This script downloads data splits for MADCAT Chinese dataset. -# It also check if madcat chinese data is present or not. - -download_dir1=/export/corpora/LDC/LDC2014T13/data -train_split_url=http://www.openslr.org/resources/50/madcat.train.raw.lineid -test_split_url=http://www.openslr.org/resources/50/madcat.test.raw.lineid -dev_split_url=http://www.openslr.org/resources/50/madcat.dev.raw.lineid -data_split_dir=data/download/datasplits - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh || exit 1; - -if [ -d $data_split_dir ]; then - echo "$0: Not downloading the data splits as it is already there." -else - if [ ! -f $data_split_dir/madcat.train.raw.lineid ]; then - mkdir -p $data_split_dir - echo "$0: Downloading the data splits..." 
- wget -P $data_split_dir $train_split_url || exit 1; - wget -P $data_split_dir $test_split_url || exit 1; - wget -P $data_split_dir $dev_split_url || exit 1; - fi - echo "$0: Done downloading the data splits" -fi - -if [ -d $download_dir1 ]; then - echo "$0: madcat chinese data directory is present." -else - if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then - echo "$0: please download madcat data..." - fi -fi diff --git a/egs/madcat_zh/v1/local/extract_features.sh b/egs/madcat_zh/v1/local/extract_features.sh index 0660ae4b412..9fe588f31b8 100755 --- a/egs/madcat_zh/v1/local/extract_features.sh +++ b/egs/madcat_zh/v1/local/extract_features.sh @@ -1,10 +1,16 @@ #!/bin/bash + # Copyright 2017 Yiwen Shao # 2018 Ashish Arora +# Apache 2.0 +# This script runs the make features script in parallel. + nj=4 cmd=run.pl feat_dim=40 +augment='no_aug' +verticle_shift=0 echo "$0 $@" . ./cmd.sh @@ -29,11 +35,11 @@ done # split images.scp utils/split_scp.pl $scp $split_scps || exit 1; -echo "$0: Preparing the test and train feature files..." $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim \| \ + --feat-dim $feat_dim --augment_type $augment \ + --vertical-shift $verticle_shift \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_zh/v1/local/make_features.py b/egs/madcat_zh/v1/local/make_features.py deleted file mode 100755 index a21276d32c2..00000000000 --- a/egs/madcat_zh/v1/local/make_features.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2018 Hossein Hadian - -""" This script converts images to Kaldi-format feature matrices. The input to - this script is the path to a data directory, e.g. "data/train". This script - reads the images listed in images.scp and writes them to standard output - (by default) as Kaldi-formatted matrices (in text form). It also scales the - images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. - If an 'image2num_frames' file is found in the data dir, it will be used - to enforce the images to have the specified length in that file by padding - white pixels (the --padding option will be ignored in this case). This relates - to end2end chain training. - - eg. 
local/make_features.py data/train --feat-dim 40 -""" - -import argparse -import os -import sys -import numpy as np -from scipy import misc - -parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and - writes them to standard output in text format.""") -parser.add_argument('images_scp_path', type=str, - help='Path of images.scp file') -parser.add_argument('--allowed_len_file_path', type=str, default=None, - help='If supplied, each images will be padded to reach the ' - 'target length (this overrides --padding).') -parser.add_argument('--out-ark', type=str, default='-', - help='Where to write the output feature file') -parser.add_argument('--feat-dim', type=int, default=40, - help='Size to scale the height of all images') -parser.add_argument('--padding', type=int, default=5, - help='Number of white pixels to pad on the left' - 'and right side of the image.') - - -args = parser.parse_args() - - -def write_kaldi_matrix(file_handle, matrix, key): - file_handle.write(key + " [ ") - num_rows = len(matrix) - if num_rows == 0: - raise Exception("Matrix is empty") - num_cols = len(matrix[0]) - - for row_index in range(len(matrix)): - if num_cols != len(matrix[row_index]): - raise Exception("All the rows of a matrix are expected to " - "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) - if row_index != num_rows - 1: - file_handle.write("\n") - file_handle.write(" ]\n") - - -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - - -def horizontal_pad(im, allowed_lengths = None): - if allowed_lengths is None: - left_padding = right_padding = args.padding - else: # Find an allowed length for the image - imlen = im.shape[1] # width - allowed_len = 0 - for l in allowed_lengths: - if l > imlen: - allowed_len = l - break - if allowed_len == 0: - # No allowed length was found for the image (the image is too long) - return None - padding = allowed_len - imlen - left_padding = int(padding // 2) - right_padding = padding - left_padding - dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), - dtype=int)), axis=1) - return im_pad1 - - -### main ### - -data_list_path = args.images_scp_path - -if args.out_ark == '-': - out_fh = sys.stdout -else: - out_fh = open(args.out_ark,'wb') - -allowed_lengths = None -allowed_len_handle = args.allowed_len_file_path -if os.path.isfile(allowed_len_handle): - print("Found 'allowed_lengths.txt' file...", file=sys.stderr) - allowed_lengths = [] - with open(allowed_len_handle) as f: - for line in f: - allowed_lengths.append(int(line.strip())) - print("Read {} allowed lengths and will apply them to the " - "features.".format(len(allowed_lengths)), file=sys.stderr) - -num_fail = 0 -num_ok = 0 -with open(data_list_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - image_id = line_vect[0] - image_path = line_vect[1] - im = misc.imread(image_path) - im_scaled = get_scaled_image(im) - im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) - if im_horizontal_padded is None: - num_fail += 1 - continue - data = np.transpose(im_horizontal_padded, (1, 0)) - data = np.divide(data, 255.0) - num_ok += 1 - write_kaldi_matrix(out_fh, data, 
image_id) - -print('Generated features for {} images. Failed for {} (image too ' - 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_zh/v1/local/prepare_data.sh b/egs/madcat_zh/v1/local/prepare_data.sh index c1accfb5e6c..ba35b90b173 100755 --- a/egs/madcat_zh/v1/local/prepare_data.sh +++ b/egs/madcat_zh/v1/local/prepare_data.sh @@ -16,28 +16,33 @@ # images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png # spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 -stage=0 -download_dir=/export/corpora/LDC/LDC2014T13 +download_dir1=/export/corpora/LDC/LDC2014T13/data +train_split_url=http://www.openslr.org/resources/50/madcat.train.raw.lineid +test_split_url=http://www.openslr.org/resources/50/madcat.test.raw.lineid +dev_split_url=http://www.openslr.org/resources/50/madcat.dev.raw.lineid data_split_dir=data/download/datasplits . ./cmd.sh . ./path.sh . ./utils/parse_options.sh || exit 1; -if [[ ! -d $download_dir ]]; then - echo "$0: Warning: Couldn't find $download_dir." - echo "" +if [ -d $data_split_dir ]; then + echo "$0: Not downloading the data splits as it is already there." +else + if [ ! -f $data_split_dir/madcat.train.raw.lineid ]; then + mkdir -p $data_split_dir + echo "$0: Downloading the data splits..." + wget -P $data_split_dir $train_split_url || exit 1; + wget -P $data_split_dir $test_split_url || exit 1; + wget -P $data_split_dir $dev_split_url || exit 1; + fi + echo "$0: Done downloading the data splits" fi -mkdir -p data/{train,test,dev}/lines -if [ $stage -le 1 ]; then - local/process_data.py $download_dir $data_split_dir/madcat.train.raw.lineid data/train || exit 1 - local/process_data.py $download_dir $data_split_dir/madcat.test.raw.lineid data/test || exit 1 - local/process_data.py $download_dir $data_split_dir/madcat.dev.raw.lineid data/dev || exit 1 - - for dataset in train test dev; do - echo "$0: Fixing data directory for dataset: $dataset" - echo "Date: $(date)." - image/fix_data_dir.sh data/$dataset - done +if [ -d $download_dir1 ]; then + echo "$0: madcat chinese data directory is present." +else + if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then + echo "$0: please download madcat data..." + fi fi diff --git a/egs/madcat_zh/v1/run.sh b/egs/madcat_zh/v1/run.sh index f591dcccb35..b3ef370c830 100755 --- a/egs/madcat_zh/v1/run.sh +++ b/egs/madcat_zh/v1/run.sh @@ -16,16 +16,36 @@ decode_gmm=true # The datasplits can be found on http://www.openslr.org/51/ madcat_database=/export/corpora/LDC/LDC2014T13 data_split_dir=data/download/datasplits +overwrite=false +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/zh/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. +./local/check_tools.sh + +# Start from stage=-1 for using extra corpus text +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/zh.txt + head -20000 data/local/text/zh.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/zh.txt > data/local/text/cleaned/corpus.txt +fi mkdir -p data/{train,test,dev}/lines if [ $stage -le 0 ]; then - local/download_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir + + if [ -f data/train/text ] && ! 
$overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir for dataset in train test dev; do local/extract_lines.sh --nj $nj --cmd $cmd \ @@ -34,20 +54,18 @@ if [ $stage -le 0 ]; then data/${dataset}/lines done - echo "$0: Preparing data..." - local/prepare_data.sh --download-dir $madcat_database + echo "$0: Processing data..." + for set in dev train test; do + local/process_data.py $madcat_database $data_split_dir/madcat.$set.raw.lineid data/$set + image/fix_data_dir.sh data/$set + done fi -# This script uses feat-dim of 60 while the end2end version uses a feat-dim of 80 -mkdir -p data/{train_60,test_60,dev_60}/data +mkdir -p data/{train,test,dev}/data if [ $stage -le 1 ]; then for dataset in train test dev; do - for prepared in utt2spk text images.scp spk2utt; do - cp data/$dataset/$prepared data/${dataset}_60/$prepared - done - - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 60 data/${dataset}_60 - steps/compute_cmvn_stats.sh data/${dataset}_60 + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 60 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset done fi @@ -67,56 +85,56 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train_60 \ + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ data/lang exp/mono fi if [ $stage -le 5 ] && $decode_gmm; then utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph - steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test_60 \ + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ exp/mono/decode_test fi if [ $stage -le 6 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train_60 data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/mono exp/mono_ali steps/train_deltas.sh --cmd $cmd --context-opts "--context-width=2 --central-position=1" \ - 50000 20000 data/train_60 data/lang \ + 50000 20000 data/train data/lang \ exp/mono_ali exp/tri fi if [ $stage -le 7 ] && $decode_gmm; then utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph - steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test_60 \ + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ exp/tri/decode_test fi if [ $stage -le 8 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train_60 data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/tri exp/tri_ali steps/train_lda_mllt.sh --cmd $cmd \ --splice-opts "--left-context=3 --right-context=3" \ --context-opts "--context-width=2 --central-position=1" 50000 20000 \ - data/train_60 data/lang exp/tri_ali exp/tri2 + data/train data/lang exp/tri_ali exp/tri2 fi if [ $stage -le 9 ] && $decode_gmm; then utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ - data/test_60 exp/tri2/decode_test + data/test exp/tri2/decode_test fi if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train_60 data/lang exp/tri2 exp/tri2_ali + data/train data/lang exp/tri2 exp/tri2_ali steps/train_sat.sh --cmd $cmd --context-opts "--context-width=2 --central-position=1" \ - 50000 20000 data/train_60 data/lang \ + 50000 20000 data/train data/lang \ exp/tri2_ali exp/tri3 fi @@ -124,12 +142,12 @@ if [ $stage -le 11 ] && $decode_gmm; then utils/mkgraph.sh 
data/lang_test exp/tri3 exp/tri3/graph steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ - data/test_60 exp/tri3/decode_test + data/test exp/tri3/decode_test fi if [ $stage -le 12 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train_60 data/lang exp/tri3 exp/tri3_ali + data/train data/lang exp/tri3 exp/tri3_ali fi if [ $stage -le 13 ]; then diff --git a/egs/madcat_zh/v1/run_end2end.sh b/egs/madcat_zh/v1/run_end2end.sh index 7f759e54b57..7e0fc1e25d1 100755 --- a/egs/madcat_zh/v1/run_end2end.sh +++ b/egs/madcat_zh/v1/run_end2end.sh @@ -12,18 +12,35 @@ password= # in "local/prepare_data.sh" to download the database: madcat_database=/export/corpora/LDC/LDC2014T13 data_split_dir=data/download/datasplits +overwrite=false +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/zh/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. +./local/check_tools.sh -#./local/check_tools.sh +# Start from stage=-1 for using extra corpus text +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/zh.txt + head -20000 data/local/text/zh.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/zh.txt > data/local/text/cleaned/corpus.txt +fi if [ $stage -le 0 ]; then - local/download_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir for dataset in train test dev; do local/extract_lines.sh --nj $nj --cmd $cmd \ @@ -32,18 +49,20 @@ if [ $stage -le 0 ]; then data/${dataset}/lines done - echo "$0: Preparing data..." - local/prepare_data.sh --download-dir "$madcat_database" + echo "$0: Processing data..." + for set in dev train test; do + local/process_data.py $madcat_database $data_split_dir/madcat.$set.raw.lineid data/$set + image/fix_data_dir.sh data/$set + done + fi mkdir -p data/{train,test}/data if [ $stage -le 1 ]; then image/get_image2num_frames.py --feat-dim 80 data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train echo "$0: Preparing the test and train feature files..." for dataset in train test; do @@ -62,13 +81,27 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then + echo "$0: calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +lang_decode=data/lang_test +decode_e2e=true +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + data/local/dict/lexicon.txt $lang_decode fi -if [ $stage -le 4 ]; then - echo "$0: calling the flat-start chain recipe..." 
- local/chain/run_flatstart_cnn1a.sh +if [ $stage -le 5 ] && $decode_e2e; then + echo "$0: $(date) stage 5: decoding end2end setup..." + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + echo "$0: Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ fi diff --git a/egs/mini_librispeech/s5/local/download_lm.sh b/egs/mini_librispeech/s5/local/download_lm.sh index cab78387d06..b37ae599118 100755 --- a/egs/mini_librispeech/s5/local/download_lm.sh +++ b/egs/mini_librispeech/s5/local/download_lm.sh @@ -58,7 +58,7 @@ function check_and_download () { return 0 } -mkdir -p $dst_dir +mkdir -p $dst_dir $local_dir for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz \ librispeech-vocab.txt librispeech-lexicon.txt; do diff --git a/egs/reverb/s5/README.txt b/egs/reverb/s5/README.txt index 1daa214edb6..0ac97059952 100644 --- a/egs/reverb/s5/README.txt +++ b/egs/reverb/s5/README.txt @@ -1,130 +1,36 @@ -Improved multi condition training baseline for REVERB challenge based on Kaldi -============================================================================== +Improved baseline for REVERB challenge +====================================== -updated -Wed Apr 29 19:10:33 EDT 2015 Shinji Watanabe - -updated -Wed Apr 9 12:14:02 CEST 2014 Felix Weninger - -original: -Wed Nov 6 14:47:59 EST 2013 Felix Weninger +This is an improvement over "Improved multi condition training baseline" from Felix Weninger & Shinji Watanabe Key specs: -- MFCC-LDA-STC front-end -- Boosted MMI trained GMM-HMM -- Utterance-based adaptation using basis fMLLR -- Tri-gram LM minimum Bayes risk decoding - -WER [%] -@ Language model weight = 15 -Avg(SimData_(far|near)) = 11.73 -Avg(RealData) = 30.44 -@ Language model weight = 16 (optimal) -Avg(SimData_(far|near)) = 11.72 -Avg(RealData) = 30.28 - -See RESULTS in more detail - -Kaldi SVN rev. 5035, 4/26/15 -tested on Ubuntu 13.04 +- Nara-WPE and BeamformIt front-end enhancement +- TDNN acoustic model +RESULT: +For experiment results, please see RESULTS for more detail REFERENCE: ++++++++ If you find this software useful for your own research, please cite the -following paper: +following papers: Felix Weninger, Shinji Watanabe, Jonathan Le Roux, John R. Hershey, Yuuki Tachioka, Jürgen Geiger, Björn Schuller, Gerhard Rigoll: "The MERL/MELCO/TUM system for the REVERB Challenge using Deep Recurrent Neural Network Feature Enhancement", Proc. REVERB Workshop, IEEE, Florence, Italy, May 2014. +Lukas Drude, Jahn Heymann, Christoph Boeddeker, and Reinhold Haeb-Umbach: +"NARA-WPE: A Python package for weighted prediction error dereverberation in +Numpy and Tensorflow for online and offline processing." In Speech Communication; +13th ITG-Symposium, pp. 1-5. VDE, 2018. INSTRUCTIONS: +++++++++++++ - -1) Set the path names in corpus.sh.default, - and copy this file to "corpus.sh" - ------ -2) [optional:] If you have speech enhancement (processed waveforms), then - -3a) Change directories and data preparation steps - For example, you could have something like - - local/REVERB_wsjcam0_data_prep.sh /path/to/processed/REVERB_WSJCAM0_dt REVERB_dt_derev dt - - The first argument is supposed to point to a folder that has the same - structure as the REVERB corpus. 
- -3b) run the multi-condition training steps in run.sh with the processed - training set, e.g., REVERB_tr_cut_derev, if you want to investigate - recognizer re-training - - - Any system that has _mc in its name uses multi-condition training - - You probably want to change the system names if you are using enhanced - data for training (e.g. tri2b_mc -> tri2b_mc_derev) - -3c) Add your re-trained recognizer to the list of recognizers that are - discriminatively re-trained - -3d) Modify the decoding steps in run.sh so that they use enhanced data and add - your re-trained recognizer(s) to the list ------ - -4) Execute the training and recognition steps by +1) Execute the training and recognition steps by ./run.sh Depending on your system specs (# of CPUs, RAM) you might want (or have) to - change the number of parallel jobs -- this is controlled by the nj_train, - nj_bg, and nj_tg variables (# of jobs for training, for bi-gram and tri-gram - decoding). - - If you also want to have the re-implementation of the HTK baseline in Kaldi - (tri2a and tri2a_mc systems), set the do_tri2a variable to true in run.sh. - -5) Execute - - ./local/get_results.sh - - to display the results corresponding to Table 1 in - the following paper, - - Felix Weninger, Shinji Watanabe, Jonathan Le Roux, John R. Hershey, Yuuki - Tachioka, Jürgen Geiger, Björn Schuller, Gerhard Rigoll: "The MERL/MELCO/TUM - system for the REVERB Challenge using Deep Recurrent Neural Network Feature - Enhancement", to appear in Proc. REVERB Workshop, IEEE, Florence, Italy, 2014. - - NOTE: It is very common to have slightly different results (up to +/- 1% - absolute WER per REVERB task file) on different machines. The reason for - this is not fully known. - - NOTE 2: By default, only the LDA-STC systems are trained - set do_tri2a in - run.sh to true to also train the Delta+Delta-Delta systems (cf. above). - ------ -6) You can get more recognition results (for other combinations of front-ends, - adaptation, language model, etc.), by - - $> local/summarize_results.pl [options] [ [ /dev/null) - tot_eer=$(echo "$tot_eer+$eer" | bc) + tot_eer=$(perl -e "print ($tot_eer+$eer);") eers[$condition]=$eer done -eers[0]=$(echo "$tot_eer/8" | bc -l) +eers[0]=$(perl -e "print ($tot_eer/8.0);") for i in $(seq 0 8); do printf '% 7.2f' ${eers[$i]} diff --git a/egs/sre16/v1/run.sh b/egs/sre16/v1/run.sh index 52ee86ec5b2..28481e27c3a 100755 --- a/egs/sre16/v1/run.sh +++ b/egs/sre16/v1/run.sh @@ -130,7 +130,7 @@ if [ $stage -le 4 ]; then # Make a reverberated version of the SRE list. Note that we don't add any # additive noise here. 
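# A tiny standalone illustration of the bc -> perl change in the EER averaging
# above: perl handles the floating-point arithmetic itself, so the script no
# longer needs the external bc binary. The numbers are arbitrary stand-ins for
# per-condition EER values.
eer1=4.25; eer2=3.50
tot=$(perl -e "print ($eer1+$eer2);")   # -> 7.75
avg=$(perl -e "print ($tot/2.0);")      # -> 3.875
echo "total=$tot avg=$avg"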
- python steps/data/reverberate_data_dir.py \ + steps/data/reverberate_data_dir.py \ "${rvb_opts[@]}" \ --speech-rvb-probability 1 \ --pointsource-noise-addition-probability 0 \ @@ -155,11 +155,11 @@ if [ $stage -le 4 ]; then done # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/sre data/sre_noise + steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/sre data/sre_noise # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/sre data/sre_music + steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/sre data/sre_music # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/sre data/sre_babble + steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/sre data/sre_babble # Combine reverb, noise, music, and babble into one directory. utils/combine_data.sh data/sre_aug data/sre_reverb data/sre_noise data/sre_music data/sre_babble diff --git a/egs/sre16/v2/run.sh b/egs/sre16/v2/run.sh index f1d9eb72ddc..b2072dfd69d 100755 --- a/egs/sre16/v2/run.sh +++ b/egs/sre16/v2/run.sh @@ -120,7 +120,7 @@ if [ $stage -le 2 ]; then # Make a reverberated version of the SWBD+SRE list. Note that we don't add any # additive noise here. - python steps/data/reverberate_data_dir.py \ + steps/data/reverberate_data_dir.py \ "${rvb_opts[@]}" \ --speech-rvb-probability 1 \ --pointsource-noise-addition-probability 0 \ @@ -145,11 +145,11 @@ if [ $stage -le 2 ]; then done # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/swbd_sre data/swbd_sre_noise + steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/swbd_sre data/swbd_sre_noise # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/swbd_sre data/swbd_sre_music + steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/swbd_sre data/swbd_sre_music # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/swbd_sre data/swbd_sre_babble + steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/swbd_sre data/swbd_sre_babble # Combine reverb, noise, music, and babble into one directory. 
utils/combine_data.sh data/swbd_sre_aug data/swbd_sre_reverb data/swbd_sre_noise data/swbd_sre_music data/swbd_sre_babble diff --git a/egs/swbd/s5c/local/score_sclite_conf.sh b/egs/swbd/s5c/local/score_sclite_conf.sh index 9a1fa5083bf..21da4520a4d 100755 --- a/egs/swbd/s5c/local/score_sclite_conf.sh +++ b/egs/swbd/s5c/local/score_sclite_conf.sh @@ -39,6 +39,12 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; done +if [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + name=`basename $data`; # e.g. eval2000 mkdir -p $dir/scoring/log @@ -51,7 +57,7 @@ if [ $stage -le 0 ]; then ACWT=\`perl -e \"print 1.0/LMWT\;\"\` '&&' \ lattice-add-penalty --word-ins-penalty=$wip "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr --acoustic-scale=\$ACWT ark:- - \| \ + lattice-to-ctm-conf $frame_shift_opt --decode-mbr=$decode_mbr --acoustic-scale=\$ACWT ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; diff --git a/egs/swbd/s5c/local/swbd1_map_words.pl b/egs/swbd/s5c/local/swbd1_map_words.pl index 39f90d72816..125e4de0d61 100755 --- a/egs/swbd/s5c/local/swbd1_map_words.pl +++ b/egs/swbd/s5c/local/swbd1_map_words.pl @@ -44,7 +44,7 @@ # which is a mistake in the input. $a =~ s:^\{(.+)\}$:$1:; # e.g. {YUPPIEDOM} -> YUPPIEDOM $a =~ s:[A-Z]\[([^][])+\][A-Z]:$1-$3:i; # e.g. AMMU[N]IT- -> AMMU-IT- - $a =~ s:_\d$::; # e.g. THEM_1 -> THEM + $a =~ s:_\d::; # e.g. 
THEM_1 -> THEM, THEM_1's -> THEM's } $A[$n] = $a; } diff --git a/egs/tedlium/s5_r3/RESULTS b/egs/tedlium/s5_r3/RESULTS new file mode 100644 index 00000000000..b2f9526a8fd --- /dev/null +++ b/egs/tedlium/s5_r3/RESULTS @@ -0,0 +1,32 @@ +# This RESULTS file was obtained by running ./run.sh and then ./result.sh + +%WER 28.32 [ 5037 / 17783, 615 ins, 1171 del, 3251 sub ] exp/tri1/decode_nosp_dev/wer_10 +%WER 26.99 [ 4799 / 17783, 603 ins, 1169 del, 3027 sub ] exp/tri1/decode_nosp_dev_rescore/wer_10 +%WER 27.76 [ 7634 / 27500, 776 ins, 1689 del, 5169 sub ] exp/tri1/decode_nosp_test/wer_11 +%WER 26.52 [ 7292 / 27500, 766 ins, 1611 del, 4915 sub ] exp/tri1/decode_nosp_test_rescore/wer_11 +%WER 23.38 [ 4158 / 17783, 603 ins, 953 del, 2602 sub ] exp/tri2/decode_dev/wer_14 +%WER 21.98 [ 3909 / 17783, 597 ins, 910 del, 2402 sub ] exp/tri2/decode_dev_rescore/wer_14 +%WER 24.12 [ 4289 / 17783, 600 ins, 1014 del, 2675 sub ] exp/tri2/decode_nosp_dev/wer_12 +%WER 22.96 [ 4083 / 17783, 631 ins, 931 del, 2521 sub ] exp/tri2/decode_nosp_dev_rescore/wer_11 +%WER 23.30 [ 6408 / 27500, 727 ins, 1375 del, 4306 sub ] exp/tri2/decode_nosp_test/wer_13 +%WER 22.10 [ 6078 / 27500, 746 ins, 1281 del, 4051 sub ] exp/tri2/decode_nosp_test_rescore/wer_12 +%WER 22.31 [ 6134 / 27500, 794 ins, 1148 del, 4192 sub ] exp/tri2/decode_test/wer_13 +%WER 21.06 [ 5791 / 27500, 737 ins, 1147 del, 3907 sub ] exp/tri2/decode_test_rescore/wer_14 +%WER 19.99 [ 3554 / 17783, 570 ins, 816 del, 2168 sub ] exp/tri3_cleaned/decode_dev/wer_16 +%WER 18.92 [ 3364 / 17783, 588 ins, 791 del, 1985 sub ] exp/tri3_cleaned/decode_dev_rescore/wer_15 +%WER 23.85 [ 4241 / 17783, 686 ins, 874 del, 2681 sub ] exp/tri3_cleaned/decode_dev.si/wer_13 +%WER 17.73 [ 4876 / 27500, 700 ins, 935 del, 3241 sub ] exp/tri3_cleaned/decode_test/wer_16 +%WER 16.72 [ 4599 / 27500, 686 ins, 906 del, 3007 sub ] exp/tri3_cleaned/decode_test_rescore/wer_16 +%WER 22.10 [ 6077 / 27500, 864 ins, 1075 del, 4138 sub ] exp/tri3_cleaned/decode_test.si/wer_13 +%WER 19.63 [ 3490 / 17783, 585 ins, 809 del, 2096 sub ] exp/tri3/decode_dev/wer_15 +%WER 18.56 [ 3300 / 17783, 558 ins, 817 del, 1925 sub ] exp/tri3/decode_dev_rescore/wer_16 +%WER 23.75 [ 4224 / 17783, 661 ins, 917 del, 2646 sub ] exp/tri3/decode_dev.si/wer_14 +%WER 17.92 [ 4928 / 27500, 730 ins, 921 del, 3277 sub ] exp/tri3/decode_test/wer_14 +%WER 16.80 [ 4621 / 27500, 650 ins, 973 del, 2998 sub ] exp/tri3/decode_test_rescore/wer_17 +%WER 22.16 [ 6095 / 27500, 849 ins, 1070 del, 4176 sub ] exp/tri3/decode_test.si/wer_13 +%WER 8.17 [ 1453 / 17783, 242 ins, 310 del, 901 sub ] exp/chain_cleaned/tdnnf_1a/decode_dev/wer_9 +%WER 7.61 [ 1354 / 17783, 236 ins, 300 del, 818 sub ] exp/chain_cleaned/tdnnf_1a/decode_dev_rescore/wer_9 +%WER 6.17 [ 1097 / 17783, 207 ins, 292 del, 598 sub ] exp/chain_cleaned/tdnnf_1a/decode_dev_rnnlm_lstm_tdnn_a_averaged/wer_10 +%WER 8.16 [ 2245 / 27500, 288 ins, 605 del, 1352 sub ] exp/chain_cleaned/tdnnf_1a/decode_test/wer_9 +%WER 7.75 [ 2131 / 27500, 264 ins, 643 del, 1224 sub ] exp/chain_cleaned/tdnnf_1a/decode_test_rescore/wer_10 +%WER 6.84 [ 1880 / 27500, 283 ins, 533 del, 1064 sub ] exp/chain_cleaned/tdnnf_1a/decode_test_rnnlm_lstm_tdnn_a_averaged/wer_8 diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh index 32252db937d..73a684b6379 100755 --- a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -30,7 +30,6 @@ epochs=20 [ -z "$cmd" ] && cmd=$train_cmd 
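# A quick, self-contained check (words invented) of the regex change in
# swbd1_map_words.pl above: dropping the end-of-string anchor lets the
# _<digit> variant marker be stripped even when a suffix such as 's follows,
# rather than only when the marker ends the word.
printf "THEM_1\nTHEM_1's\n" | perl -pe 's:_\d::'
# old pattern s:_\d$:: -> THEM, THEM_1's   new pattern s:_\d:: -> THEM, THEM's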
text_from_audio=data/train/text -text=data/LM/train.txt wordlist=data/lang_chain/words.txt dev_sents=10000 text_dir=data/rnnlm/text @@ -44,8 +43,9 @@ done if [ $stage -le 0 ]; then mkdir -p $text_dir + gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' > $text_dir/train.txt # shuffle text from audio and lm - cat $text_from_audio | cut -d ' ' -f2- | cat $text |\ + cat $text_from_audio | cut -d ' ' -f2- | cat $text_dir/train.txt |\ shuf > data/rnnlm/full_lm_data.shuffled # create dev and train sets based on audio and LM data cat data/rnnlm/full_lm_data.shuffled | head -n $dev_sents> $text_dir/dev.txt diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh index 431d44c6ff6..6cbcaaa85ee 100755 --- a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -14,7 +14,7 @@ wget --continue http://kaldi-asr.org/models/5/tedlium_rnnlm.tgz -P exp/rnnlm_lst cd exp/rnnlm_lstm_tdnn_a_averaged tar -xvzf tedlium_rnnlm.tgz || exit 1 rm tedlium_rnnlm.tgz -mkdir config +mkdir -p config cd ../.. cp data/lang/words.txt exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt echo " 152217" >> exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt diff --git a/egs/tedlium/s5_r3/results.sh b/egs/tedlium/s5_r3/results.sh index 98bcab94ec5..3e318cb4bc7 100755 --- a/egs/tedlium/s5_r3/results.sh +++ b/egs/tedlium/s5_r3/results.sh @@ -1,10 +1,25 @@ #!/bin/bash +# The output of this script (after successfully running ./run.sh) can be found in the RESULTS file. + filter_regexp=. [ $# -ge 1 ] && filter_regexp=$1 -for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null - for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp - for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp +for x in exp/*/decode*; do + [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; +done 2>/dev/null + +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do + [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; +done 2>/dev/null | grep $filter_regexp + +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do + [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; +done 2>/dev/null | grep $filter_regexp + +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do + [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; +done 2>/dev/null | grep $filter_regexp + exit 0 diff --git a/egs/timit/s5/local/timit_data_prep.sh b/egs/timit/s5/local/timit_data_prep.sh index 6248fc0368a..be2d6725952 100755 --- a/egs/timit/s5/local/timit_data_prep.sh +++ b/egs/timit/s5/local/timit_data_prep.sh @@ -70,7 +70,7 @@ for x in train dev test; do find $*/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.WAV' \ | grep -f $tmpdir/${x}_spk > ${x}_sph.flist - sed -e 's:.*/\(.*\)/\(.*\).WAV$:\1_\2:' ${x}_sph.flist \ + sed -e 's:.*/\(.*\)/\(.*\).\(WAV\|wav\)$:\1_\2:' ${x}_sph.flist \ > $tmpdir/${x}_sph.uttids paste $tmpdir/${x}_sph.uttids ${x}_sph.flist \ | sort -k1,1 > ${x}_sph.scp @@ -82,7 +82,7 @@ for x in train dev test; do # ID followed by the transcript. 
find $*/{$train_dir,$test_dir} -not \( -iname 'SA*' \) -iname '*.PHN' \ | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_phn.flist - sed -e 's:.*/\(.*\)/\(.*\).PHN$:\1_\2:' $tmpdir/${x}_phn.flist \ + sed -e 's:.*/\(.*\)/\(.*\).\(PHN\|phn\)$:\1_\2:' $tmpdir/${x}_phn.flist \ > $tmpdir/${x}_phn.uttids while read line; do [ -f $line ] || error_exit "Cannot find transcription file '$line'"; diff --git a/egs/uw3/v1/local/process_data.py b/egs/uw3/v1/local/process_data.py index 3643c0aca89..23b8e5402cf 100755 --- a/egs/uw3/v1/local/process_data.py +++ b/egs/uw3/v1/local/process_data.py @@ -52,10 +52,10 @@ # The dataset is randomly split train 95% and test 5% coin = random.randint(0, 20) if coin >= 1: - train_text_fh.write(utt_id + ' ' + text + '\n') + train_text_fh.write("{} {}\n".format(utt_id, text)) train_utt2spk_fh.write("{} {}\n".format(utt_id, page_count)) - train_image_fh.write("{} {}\n".format(utt_id, image_path) + train_image_fh.write("{} {}\n".format(utt_id, image_path)) elif coin < 1: test_text_fh.write("{} {}\n".format(utt_id, text)) test_utt2spk_fh.write("{} {}\n".format(utt_id, page_count)) - train_image_fh.write("{} {}\n".format(utt_id, image_path) + train_image_fh.write("{} {}\n".format(utt_id, image_path)) diff --git a/egs/voxceleb/v2/run.sh b/egs/voxceleb/v2/run.sh index f8c50d7f9df..37bb60fe35c 100755 --- a/egs/voxceleb/v2/run.sh +++ b/egs/voxceleb/v2/run.sh @@ -66,7 +66,7 @@ if [ $stage -le 2 ]; then # Make a reverberated version of the VoxCeleb2 list. Note that we don't add any # additive noise here. - python steps/data/reverberate_data_dir.py \ + steps/data/reverberate_data_dir.py \ "${rvb_opts[@]}" \ --speech-rvb-probability 1 \ --pointsource-noise-addition-probability 0 \ @@ -91,11 +91,11 @@ if [ $stage -le 2 ]; then done # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise + steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music + steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble + steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble # Combine reverb, noise, music, and babble into one directory. 
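# Minimal illustration (made-up path, GNU sed) of the case-insensitive
# extension handling added to the TIMIT preparation above: the old pattern
# only matched upper-case .WAV/.PHN names, while the new \(WAV\|wav\)
# alternation also picks up copies of the corpus with lower-case extensions.
echo "/data/timit/dr1/fcjf0/si1027.wav" | sed -e 's:.*/\(.*\)/\(.*\).\(WAV\|wav\)$:\1_\2:'
# -> fcjf0_si1027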
utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble diff --git a/egs/voxforge/s5/local/voxforge_prepare_dict.sh b/egs/voxforge/s5/local/voxforge_prepare_dict.sh index 4242af29d25..daf4e2326e5 100755 --- a/egs/voxforge/s5/local/voxforge_prepare_dict.sh +++ b/egs/voxforge/s5/local/voxforge_prepare_dict.sh @@ -49,7 +49,7 @@ if [[ "$(uname)" == "Darwin" ]]; then alias readlink=greadlink fi -sequitur=$KALDI_ROOT/tools/sequitur +sequitur=$KALDI_ROOT/tools/sequitur-g2p export PATH=$PATH:$sequitur/bin export PYTHONPATH=$PYTHONPATH:`utils/make_absolute.sh $sequitur/lib/python*/site-packages` diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh b/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh index 9a4f0c87c8d..1ddb3c305ac 100755 --- a/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh +++ b/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh @@ -3,33 +3,31 @@ # This script performs chain training in a flat-start manner # and without building or using any context-dependency tree. -# It does not use ivecors or other forms of speaker adaptation -# except simple mean and variance normalization. +# It does not use ivecors or other forms of speaker adaptation. # It is called from run_e2e_phone.sh # Note: this script is configured as phone-based, if you want # to run it in character mode, you'll need to change _nosp -# to _char everywhere and also copy char_lm.fst instead -# of phone_lm.fst (in stage 1 below) - -# local/chain/compare_wer.sh exp/chain/e2e_tdnn_1a -# System e2e_tdnn_1a -#WER dev93 (tgpr) 9.63 -#WER dev93 (tg) 9.07 -#WER dev93 (big-dict,tgpr) 7.41 -#WER dev93 (big-dict,fg) 6.55 -#WER eval92 (tgpr) 5.90 -#WER eval92 (tg) 5.17 -#WER eval92 (big-dict,tgpr) 3.56 -#WER eval92 (big-dict,fg) 2.85 -# Final train prob -0.0726 -# Final valid prob -0.0884 +# to _char everywhere. 
+ +# local/chain/compare_wer.sh exp/chain/e2e_tdnnf_1a +# System e2e_tdnnf_1a +#WER dev93 (tgpr) 8.77 +#WER dev93 (tg) 8.11 +#WER dev93 (big-dict,tgpr) 6.17 +#WER dev93 (big-dict,fg) 5.66 +#WER eval92 (tgpr) 5.62 +#WER eval92 (tg) 5.19 +#WER eval92 (big-dict,tgpr) 3.23 +#WER eval92 (big-dict,fg) 2.80 +# Final train prob -0.0618 +# Final valid prob -0.0825 # Final train prob (xent) # Final valid prob (xent) -# Num-params 3740934 +# Num-params 6772564 -# steps/info/chain_dir_info.pl exp/chain/e2e_tdnn_1a -# exp/chain/e2e_tdnn_1a: num-iters=102 nj=2..5 num-params=3.7M dim=40->84 combine=-0.117->-0.116 (over 3) logprob:train/valid[67,101,final]=(-0.080,-0.073,-0.073/-0.090,-0.089,-0.088) +# steps/info/chain_dir_info.pl exp/chain/e2e_tdnnf_1a +# exp/chain/e2e_tdnnf_1a: num-iters=180 nj=2..8 num-params=6.8M dim=40->84 combine=-0.060->-0.060 (over 3) logprob:train/valid[119,179,final]=(-0.080,-0.062,-0.062/-0.089,-0.083,-0.083) set -e @@ -40,15 +38,15 @@ get_egs_stage=-10 affix=1a # training options -num_epochs=4 +dropout_schedule='0,0@0.20,0.5@0.50,0' +num_epochs=10 num_jobs_initial=2 -num_jobs_final=5 -minibatch_size=150=128,64/300=100,64,32/600=50,32,16/1200=16,8 +num_jobs_final=8 +minibatch_size=150=128,64/300=64,32/600=32,16/1200=8 common_egs_dir= l2_regularize=0.00005 -dim=450 frames_per_iter=3000000 -cmvn_opts="--norm-means=true --norm-vars=true" +cmvn_opts="--norm-means=false --norm-vars=false" train_set=train_si284_spe2e_hires test_sets="test_dev93 test_eval92" @@ -69,7 +67,7 @@ fi lang=data/lang_e2e treedir=exp/chain/e2e_tree # it's actually just a trivial tree (no tree building) -dir=exp/chain/e2e_tdnn_${affix} +dir=exp/chain/e2e_tdnnf_${affix} if [ $stage -le 0 ]; then # Create a version of the lang/ directory that has one state per phone in the @@ -102,25 +100,35 @@ fi if [ $stage -le 2 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - opts="l2-regularize=0.01" - output_opts="l2-regularize=0.0025" + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input - relu-batchnorm-layer name=tdnn1 input=Append(-1,0,1) dim=$dim - relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$dim $opts - relu-batchnorm-layer name=tdnn3 dim=$dim $opts - relu-batchnorm-layer name=tdnn4 input=Append(-1,0,1) dim=$dim $opts - relu-batchnorm-layer name=tdnn5 dim=$dim $opts - relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$dim $opts - relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$dim $opts - relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$dim $opts - - relu-batchnorm-layer name=prefinal-chain dim=$dim target-rms=0.5 $opts - output-layer name=output include-log-softmax=true dim=$num_targets $output_opts + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-1,0,1) $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 
time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs @@ -139,14 +147,15 @@ if [ $stage -le 3 ]; then --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ --egs.opts "" \ + --trainer.dropout-schedule $dropout_schedule \ --trainer.num-chunk-per-minibatch $minibatch_size \ --trainer.frames-per-iter $frames_per_iter \ --trainer.num-epochs $num_epochs \ --trainer.optimization.momentum 0 \ --trainer.optimization.num-jobs-initial $num_jobs_initial \ --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ --trainer.optimization.shrink-value 1.0 \ --trainer.max-param-change 2.0 \ --cleanup.remove-egs true \ diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh b/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh index 65a8cee6005..be82e80d5fe 100755 --- a/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh +++ b/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh @@ -6,31 +6,32 @@ # a full trivial biphone context-dependency tree. This is because this recipe is # meant for character-based (i.e. lexicon-free) modeling where context helps # significantly. -# It does not use ivecors or other forms of speaker adaptation -# except simple mean and variance normalization. +# It does not use ivecors or other forms of speaker adaptation. # It is called from run_e2e_char.sh # Note: this script is configured to run as character-based, if you want # to run it in phoneme mode, you'll need to change _char -# to _nosp everywhere and also copy phone_lm.fst instead -# of char_lm.fst (in stage 1 below) +# to _nosp everywhere. 
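
A note on the new --trainer.dropout-schedule option introduced above: the value '0,0@0.20,0.5@0.50,0' is a piecewise-linear schedule over the fraction of training data processed, i.e. dropout stays at 0 until 20% of training, ramps up to 0.5 at the 50% mark, and decays back to 0 by the end. The sketch below only illustrates that reading with linear interpolation; it is not Kaldi's actual schedule-parsing code, and the exact interpolation details are an assumption.

# Illustration only: evaluate '0,0@0.20,0.5@0.50,0' at a given data fraction.
def dropout_at(data_fraction,
               points=((0.0, 0.0), (0.20, 0.0), (0.50, 0.5), (1.0, 0.0))):
    for (x0, y0), (x1, y1) in zip(points, points[1:]):
        if x0 <= data_fraction <= x1:
            return y0 + (y1 - y0) * (data_fraction - x0) / (x1 - x0)
    return points[-1][1]

print(dropout_at(0.35))   # 0.25, halfway between the 20% and 50% points
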
+# local/chain/compare_wer.sh exp/chain/e2e_tdnn_lstm_bichar_1a # System e2e_tdnn_lstm_bichar_1a -# WER dev93 (tgpr) 9.42 -# WER dev93 (tg) 8.85 -# WER dev93 (big-dict,tgpr) 7.70 -# WER dev93 (big-dict,fg) 6.79 -# WER eval92 (tgpr) 6.42 -# WER eval92 (tg) 6.11 -# WER eval92 (big-dict,tgpr) 4.50 -# WER eval92 (big-dict,fg) 4.09 -# Final train prob -0.7535 -# Final valid prob -0.7786 +#WER dev93 (tgpr) 9.85 +#WER dev93 (tg) 9.32 +#WER dev93 (big-dict,tgpr) 8.19 +#WER dev93 (big-dict,fg) 7.27 +#WER eval92 (tgpr) 6.89 +#WER eval92 (tg) 6.70 +#WER eval92 (big-dict,tgpr) 5.14 +#WER eval92 (big-dict,fg) 4.29 +# Final train prob -0.0610 +# Final valid prob -0.0836 +# Final train prob (xent) +# Final valid prob (xent) +# Num-params 9219188 # steps/info/chain_dir_info.pl exp/chain/e2e_tdnn_lstm_bichar_1a/ -# exp/chain/e2e_tdnn_lstm_bichar_1a/: num-iters=138 nj=2..5 num-params=9.2M dim=40->3444 combine=-6.480->-6.478 logprob:train/valid[91,137,final]=(-0.766,-0.754,-0.754/-0.784,-0.779,-0.779) - +# exp/chain/e2e_tdnn_lstm_bichar_1a_nocmvn: num-iters=138 nj=2..5 num-params=9.2M dim=40->3444 combine=-1.211->-1.211 (over 3) logprob:train/valid[91,137,final]=(-0.079,-0.062,-0.061/-0.093,-0.084,-0.084) set -e @@ -50,7 +51,7 @@ common_egs_dir= l2_regularize=0.00001 dim=512 frames_per_iter=2500000 -cmvn_opts="--norm-means=true --norm-vars=true" +cmvn_opts="--norm-means=false --norm-vars=false" train_set=train_si284_spe2e_hires test_sets="test_dev93 test_eval92" diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnnf_flatstart_char.sh b/egs/wsj/s5/local/chain/e2e/run_tdnnf_flatstart_char.sh new file mode 120000 index 00000000000..b20849c2a48 --- /dev/null +++ b/egs/wsj/s5/local/chain/e2e/run_tdnnf_flatstart_char.sh @@ -0,0 +1 @@ +tuning/run_tdnnf_flatstart_char1b.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh new file mode 100755 index 00000000000..4ab0cf58d53 --- /dev/null +++ b/egs/wsj/s5/local/chain/e2e/tuning/run_tdnnf_flatstart_char1a.sh @@ -0,0 +1,225 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script performs chain training in a flat-start manner +# and without building or using any context-dependency tree. +# It does not use ivecors or other forms of speaker adaptation +# It is called from run_e2e_char.sh + +# Note: this script is configured as grapheme-based, if you want +# to run it in phoneme mode, you'll need to change _char +# to _nosp everywhere. + +# This is the same as run_tdnn_lstm_flatstart.sh except it uses +# TDNN-F (and CMVN is disabled). 
+ + +# local/chain/compare_wer.sh exp/chain/e2e_tdnn_lstm_bichar_1a exp/chain/e2e_tdnnf_bichar1a +# System e2e_tdnn_lstm_bichar_1a e2e_tdnnf_bichar1a +# WER dev93 (tgpr) 9.42 8.89 +# WER dev93 (tg) 8.85 8.20 +# WER dev93 (big-dict,tgpr) 7.70 6.96 +# WER dev93 (big-dict,fg) 6.79 6.01 +# WER eval92 (tgpr) 6.42 6.08 +# WER eval92 (tg) 6.11 5.79 +# WER eval92 (big-dict,tgpr) 4.50 4.39 +# WER eval92 (big-dict,fg) 4.09 3.88 +# Final train prob -0.0610 -0.0598 +# Final valid prob -0.0836 -0.0854 +# Final train prob (xent) +# Final valid prob (xent) +# Num-params 9219188 7421044 + +# steps/info/chain_dir_info.pl exp/chain/e2e_tdnnf_bichar1a +# exp/chain/e2e_tdnnf_bichar1a: num-iters=180 nj=2..8 num-params=7.4M dim=40->3444 combine=-0.064->-0.064 (over 3) logprob:train/valid[119,179,final]=(-0.093,-0.060,-0.060/-0.107,-0.086,-0.085) + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +dropout_schedule='0,0@0.20,0.5@0.50,0' +num_epochs=10 +num_jobs_initial=2 +num_jobs_final=8 +minibatch_size=150=128,64/300=64,32/600=32,16/1200=8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=3000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train_si284_spe2e_hires +test_sets="test_dev93 test_eval92" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + echo "$0: Estimating a phone language model for the denominator graph..." + mkdir -p $treedir/log + $train_cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py --between-silprob 0.1 \ + data/lang_char \| \ + utils/sym2int.pl -f 2- data/lang_char/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=2000 \ + ark:- $treedir/phone_lm.fst + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$train_cmd" \ + --type biphone \ + --shared-phones true \ + data/$train_set $lang $treedir +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + + input dim=40 name=input + + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-1,0,1) $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 
bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_char_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_char_test_tgpr \ + $dir $treedir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \ + $dir $treedir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=150 + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 1397 combine=-0.064->-0.064 (over 2) logprob:train/valid[119,179,final]=(-0.086,-0.060,-0.060/-0.099,-0.087,-0.087) + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1b + +# training options +dropout_schedule='0,0@0.20,0.5@0.50,0' +num_epochs=10 +num_jobs_initial=2 +num_jobs_final=8 +minibatch_size=150=128,64/300=64,32/600=32,16/1200=8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=3000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train_si284_spe2e_hires +test_sets="test_dev93 test_eval92" + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + echo "$0: Estimating a phone language model for the denominator graph..." 
+ mkdir -p $treedir/log + $train_cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py --between-silprob 0.1 \ + data/lang_char \| \ + utils/sym2int.pl -f 2- data/lang_char/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=2000 \ + ark:- $treedir/phone_lm.fst + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$train_cmd" \ + --type biphone \ + --shared-phones true \ + --tie true \ + --min-biphone-count 100 \ + --min-monophone-count 20 \ + data/$train_set $lang $treedir +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + + input dim=40 name=input + + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-1,0,1) $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_char_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_char_test_tgpr \ + $dir $treedir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_char_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_char_test_bd_tgpr \ + $dir $treedir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=150 + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l word, the entry is a list: [utt_id, word, set(pronunciation_candidates)]. e.g: # [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] -# For each , we split the phones it aligns to into two parts: "nonsil_left", +# For each , we split the phones it aligns to into two parts: "nonsil_left", # which includes phones before the first silphone, and "nonsil_right", which includes -# phones after the last silphone. For example, for : 'V SIL B AH SIL', +# phones after the last silphone. For example, for : 'V SIL B AH SIL', # nonsil_left is 'V' and nonsil_right is empty ''. After processing an entry # in ctm_prons, we put it in "info" as an entry: [utt_id, word, nonsil_right] # only if it's nonsil_right segment is not empty, which may be used when processing # the next word. -# +# # Normally, one non- word is only aligned to one pronounciation candidate. However # when there is a preceding/following , like in the following example, we # assume the phones aligned to should be statistically distributed @@ -90,7 +90,7 @@ def ReadEntries(file_handle): # Thus we append the "nonsil_left" segment of these phones to the pronounciation # of the preceding word, if the last phone of this pronounciation is not a silence phone, # Similarly we can add a pron candidate to the following word. 
-# +# # For example, for the following part of a ctm_prons file: # 911Mothers_2010W-0010916-0012901-1 other AH DH ER # 911Mothers_2010W-0010916-0012901-1 K AH N SIL B @@ -99,11 +99,11 @@ def ReadEntries(file_handle): # 911Mothers_2010W-0010916-0012901-1 when W EH N # 911Mothers_2010W-0010916-0012901-1 people P IY P AH L # 911Mothers_2010W-0010916-0012901-1 SIL -# 911Mothers_2010W-0010916-0012901-1 heard HH ER +# 911Mothers_2010W-0010916-0012901-1 heard HH ER # 911Mothers_2010W-0010916-0012901-1 D # 911Mothers_2010W-0010916-0012901-1 that SIL DH AH T # 911Mothers_2010W-0010916-0012901-1 my M AY -# +# # The corresponding segment in the "info" list is: # [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] # [911Mothers_2010W-0010916-0012901-1, , 'B' @@ -113,7 +113,7 @@ def ReadEntries(file_handle): # [911Mothers_2010W-0010916-0012901-1, , 'D'] # [911Mothers_2010W-0010916-0012901-1, that, set('SIL DH AH T')] # [911Mothers_2010W-0010916-0012901-1, my, set('M AY')] -# +# # Then we accumulate pronouciation stats from "info". Basically, for each occurence # of a word, each pronounciation candidate gets equal soft counts. e.g. In the above # example, each pron candidate of "because" gets a count of 1/4. The stats is stored @@ -139,20 +139,20 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron # So we apply the same merging method in these cases. if word == '' or (word in non_scored_words and word != '' and word != ''): nonsil_left = [] - nonsil_right = [] + nonsil_right = [] for phone in phones: if phone in silphones: break nonsil_left.append(phone) - + for phone in reversed(phones): if phone in silphones: break nonsil_right.insert(0, phone) - + # info[-1][0] is the utt_id of the last entry - if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]: - # pron_ext is a set of extended pron candidates. + if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]: + # pron_ext is a set of extended pron candidates. pron_ext = set() # info[-1][2] is the set of pron candidates of the last entry. for pron in info[-1][2]: @@ -211,7 +211,7 @@ def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_pron stats[(word, phones)] = stats.get((word, phones), 0) + count return stats -def WriteStats(stats, file_handle): +def WriteStats(stats, file_handle): for word_pron, count in stats.items(): print('{0} {1} {2}'.format(count, word_pron[0], word_pron[1]), file=file_handle) file_handle.close() @@ -222,7 +222,7 @@ def Main(): non_scored_words = ReadEntries(args.non_scored_words_file_handle) optional_silence = ReadEntries(args.optional_silence_file_handle) stats = GetStatsFromCtmProns(silphones, optional_silence.pop(), non_scored_words, args.ctm_prons_file_handle) - WriteStats(stats, args.stats_file_handle) + WriteStats(stats, args.stats_file_handle) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py index e41a67705e9..68055729fd9 100755 --- a/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py +++ b/egs/wsj/s5/steps/cleanup/internal/make_one_biased_lm.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. 
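
To make the soft-count accumulation described in the comments above concrete: each pronunciation candidate collected for one occurrence of a word contributes equally, so the four candidates for "because" in the example each receive a count of 1/4. Below is a minimal Python sketch of that bookkeeping; the candidate pronunciations are invented for illustration and the dict is not the script's real data structure.

from collections import defaultdict

stats = defaultdict(float)                        # (word, pron) -> soft count
occurrence = ("because", ["B IH K AH Z",          # hypothetical candidates
                          "B IH K AH Z AH",
                          "K AH Z",
                          "K AH Z AH"])
word, candidates = occurrence
for pron in candidates:
    stats[(word, pron)] += 1.0 / len(candidates)  # each candidate gets 1/4 here
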
@@ -142,16 +142,18 @@ def CompletelyDiscountLowCountStates(self, min_count): hist_to_total_count = self.GetHistToTotalCount() for n in reversed(list(range(2, self.ngram_order))): this_order_counts = self.counts[n] + to_delete = [] for hist in this_order_counts.keys(): if hist_to_total_count[hist] < min_count: # we need to completely back off this count. word_to_count = this_order_counts[hist] - del this_order_counts[hist] # delete the key from the dict. + # mark this key for deleting + to_delete.append(hist) backoff_hist = hist[1:] # this will be a tuple not a list. for word, count in word_to_count.items(): self.AddCount(backoff_hist, word, count) - - + for hist in to_delete: + del this_order_counts[hist] # This backs off the counts according to Kneser-Ney (unmodified, # with interpolation). @@ -200,7 +202,7 @@ def AddTopWords(self, top_words_file): word_to_count = self.counts[0][empty_history] total = sum(word_to_count.values()) try: - f = open(top_words_file) + f = open(top_words_file, mode='r', encoding='utf-8') except: sys.exit("make_one_biased_lm.py: error opening top-words file: " "--top-words=" + top_words_file) diff --git a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py index d6f0d0f6b23..af63ca27d2b 100755 --- a/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/modify_ctm_edits.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -105,7 +105,7 @@ def ReadNonScoredWords(non_scored_words_file): global non_scored_words try: - f = open(non_scored_words_file) + f = open(non_scored_words_file, encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening file: " "--non-scored-words=" + non_scored_words_file) @@ -317,12 +317,12 @@ def ProcessUtterance(split_lines_of_utt): def ProcessData(): try: - f_in = open(args.ctm_edits_in) + f_in = open(args.ctm_edits_in, encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening ctm-edits input " "file {0}".format(args.ctm_edits_in)) try: - f_out = open(args.ctm_edits_out, 'w') + f_out = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("modify_ctm_edits.py: error opening ctm-edits output " "file {0}".format(args.ctm_edits_out)) diff --git a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py index 1dae735304f..a123b13f532 100755 --- a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py +++ b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py @@ -229,7 +229,7 @@ def resolve_overlaps(ctm_edits, segments): try: cur_utt_end_index = next( (i for i, line in enumerate(ctm_edits_for_cur_utt) - if line[2] + line[3] / 2.0)> window_length - overlap)) + if line[2] + line[3] / 2.0 > window_length - overlap)) except StopIteration: cur_utt_end_index = len(ctm_edits_for_cur_utt) diff --git a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py index 39d6cb6ed80..e571fefb84c 100755 --- a/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py +++ b/egs/wsj/s5/steps/cleanup/internal/segment_ctm_edits.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 + # Copyright 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) @@ -894,7 +895,7 @@ def AccWordStatsForUtterance(split_lines_of_utt, def PrintWordStats(word_stats_out): try: - f = 
open(word_stats_out, 'w') + f = open(word_stats_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening word-stats file --word-stats-out={0} " "for writing".format(word_stats_out)) @@ -924,23 +925,23 @@ def PrintWordStats(word_stats_out): def ProcessData(): try: - f_in = open(args.ctm_edits_in) + f_in = open(args.ctm_edits_in, encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening ctm-edits input " "file {0}".format(args.ctm_edits_in)) try: - text_output_handle = open(args.text_out, 'w') + text_output_handle = open(args.text_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening text output " "file {0}".format(args.text_out)) try: - segments_output_handle = open(args.segments_out, 'w') + segments_output_handle = open(args.segments_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening segments output " "file {0}".format(args.text_out)) if args.ctm_edits_out != None: try: - ctm_edits_output_handle = open(args.ctm_edits_out, 'w') + ctm_edits_output_handle = open(args.ctm_edits_out, 'w', encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening ctm-edits output " "file {0}".format(args.ctm_edits_out)) @@ -994,7 +995,7 @@ def ProcessData(): def ReadNonScoredWords(non_scored_words_file): global non_scored_words try: - f = open(non_scored_words_file) + f = open(non_scored_words_file, encoding='utf-8') except: sys.exit("segment_ctm_edits.py: error opening file: " "--non-scored-words=" + non_scored_words_file) @@ -1015,7 +1016,7 @@ def ReadNonScoredWords(non_scored_words_file): oov_symbol = None if args.oov_symbol_file != None: try: - with open(args.oov_symbol_file) as f: + with open(args.oov_symbol_file, encoding='utf-8') as f: line = f.readline() assert len(line.split()) == 1 oov_symbol = line.split()[0] diff --git a/egs/wsj/s5/steps/cleanup/make_biased_lms.py b/egs/wsj/s5/steps/cleanup/make_biased_lms.py index ab508eedc9c..4b1fd320221 100755 --- a/egs/wsj/s5/steps/cleanup/make_biased_lms.py +++ b/egs/wsj/s5/steps/cleanup/make_biased_lms.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 from __future__ import print_function import sys @@ -55,7 +55,7 @@ def ProcessGroupOfLines(group_of_lines): try: command = "steps/cleanup/internal/make_one_biased_lm.py " + args.lm_opts p = subprocess.Popen(command, shell = True, stdin = subprocess.PIPE, - stdout = sys.stdout, stderr = sys.stderr) + stdout = sys.stdout, stderr = sys.stderr) for line in group_of_lines: a = line.split() if len(a) == 0: @@ -63,13 +63,15 @@ def ProcessGroupOfLines(group_of_lines): utterance_id = a[0] # print to utterance-map file print(utterance_id, group_utterance_id, file = utterance_map_file) - rest_of_line = ' '.join(a[1:]) # get rid of utterance id. - print(rest_of_line, file=p.stdin) + rest_of_line = ' '.join(a[1:]) + '\n' # get rid of utterance id. + p.stdin.write(rest_of_line.encode('utf-8')) p.stdin.close() assert p.wait() == 0 - except Exception as e: - sys.exit("make_biased_lms.py: error calling subprocess, command was: " + - command + ", error was : " + str(e)) + except Exception: + sys.stderr.write( + "make_biased_lms.py: error calling subprocess, command was: " + + command) + raise # Print a blank line; this terminates the FST in the Kaldi fst-archive # format. 
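
An aside on the CompletelyDiscountLowCountStates hunk a little further up: under Python 3, deleting keys from a dict while iterating over it typically raises "RuntimeError: dictionary changed size during iteration", which is why the patch now records the low-count histories in to_delete and removes them only after the loop. A tiny standalone illustration of that pattern with toy data (not the LM class itself):

counts = {("a",): 1, ("b",): 5, ("c",): 2}   # toy history -> count map
to_delete = []
for hist, count in counts.items():           # only mark while iterating
    if count < 3:
        to_delete.append(hist)
for hist in to_delete:                       # delete after the loop
    del counts[hist]
assert counts == {("b",): 5}
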
print("") diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh index ae355c9f753..751200bdf83 100755 --- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh @@ -4,7 +4,8 @@ # 2016 Vimal Manohar # Apache 2.0 -# This script is similar to steps/cleanup/segment_long_utterances.sh, but + +# This script is similar to steps/cleanup/segment_long_utterances.sh, but # uses nnet3 acoustic model instead of GMM acoustic model for decoding. # This script performs segmentation of the input data based on the transcription # and outputs segmented data along with the corresponding aligned transcription. @@ -13,7 +14,7 @@ # are of manageable length for further processing, along with the portion of the # transcript that seems to match (aligns with) each segment. # This the light-supervised training scenario where the input transcription is -# not expected to be completely clean and may have significant errors. +# not expected to be completely clean and may have significant errors. # See "JHU Kaldi System for Arabic MGB-3 ASR Challenge using Diarization, # Audio-transcript Alignment and Transfer Learning": Vimal Manohar, Daniel # Povey, Sanjeev Khudanpur, ASRU 2017 @@ -39,24 +40,22 @@ seconds_per_spk_max=30 # Decode options graph_opts= +scale_opts= # for making the graphs beam=15.0 lattice_beam=1.0 lmwt=10 - acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. -post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the - # regular scoring script works. # Contexts must ideally match training extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) -extra_right_context=0 +extra_right_context=0 extra_left_context_initial=-1 extra_right_context_final=-1 frames_per_chunk=150 # i-vector options -extractor= # i-Vector extractor. If provided, will extract i-vectors. - # Required if the network was trained with i-vector extractor. +extractor= # i-Vector extractor. If provided, will extract i-vectors. + # Required if the network was trained with i-vector extractor. use_vad=false # Use energy-based VAD for i-vector extraction # TF-IDF similarity search options @@ -116,12 +115,12 @@ it and eliminate data where the transcript doesn't seem to match. --segmentation-extra-opts 'opts' # Additional options to segment_ctm_edits_mild.py. # Please run steps/cleanup/internal/segment_ctm_edits_mild.py # without arguments to see allowed options. - --align-full-hyp # If true, align full hypothesis - i.e. trackback from the end to get the alignment. - This is different from the normal + --align-full-hyp # If true, align full hypothesis + i.e. trackback from the end to get the alignment. + This is different from the normal Smith-Waterman alignment, where the traceback will be from the maximum score. - --extractor # i-vector extractor directory if i-vector is + --extractor # i-vector extractor directory if i-vector is # to be used during decoding. Must match # the extractor used for training neural-network. --use-vad # If true, uses energy-based VAD to apply frame weights @@ -168,6 +167,23 @@ cp $srcdir/cmvn_opts $dir cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true +if [ -f $srcdir/frame_subsampling_factor ]; then + echo "$0: guessing that this is a chain system, checking parameters." 
+ if [ -z $scale_opts ]; then + echo "$0: setting scale_opts" + scale_opts="--self-loop-scale=1.0 --transition-scale=1.0" + fi + if [ $acwt == 0.1 ]; then + echo "$0: setting acwt=1.0" + acwt=1.0 + fi + if [ $lmwt == 10 ]; then + echo "$0: setting lmwt=1.0" + lmwt=1 + fi +fi + + utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt cp $lang/phones.txt $dir @@ -221,6 +237,7 @@ if [ $stage -le 3 ]; then # Make graphs w.r.t. to the original text (usually recording-level) steps/cleanup/make_biased_lm_graphs.sh $graph_opts \ + --scale-opts "$scale_opts" \ --nj $nj --cmd "$cmd" $text \ $lang $dir $dir/graphs if [ -z "$utt2text" ]; then @@ -267,7 +284,7 @@ if [ $stage -le 5 ]; then echo "$0: Decoding with biased language models..." steps/cleanup/decode_segmentation_nnet3.sh \ - --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --acwt $acwt \ --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \ --skip-scoring true --allow-partial false \ --extra-left-context $extra_left_context \ diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 189f4619ddb..b1745a4b723 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -371,8 +371,8 @@ def GenerateReverberatedWavScp(wav_scp, # a dictionary whose values are the Kal # This function replicate the entries in files like segments, utt2spk, text def AddPrefixToFields(input_file, output_file, num_replicas, include_original, prefix, field = [0]): - list = [x.strip() for x in open(input_file)] - f = open(output_file, "w") + list = [x.strip() for x in open(input_file, encoding='utf-8')] + f = open(output_file, "w" ,encoding='utf-8') if include_original: start_index = 0 else: diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py index 5597ff0e216..1628a5e314f 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py @@ -149,7 +149,7 @@ def set_derived_configs(self): if input_dim % height_in != 0: raise RuntimeError("Input dimension {0} is not a multiple of height-in={1}".format( input_dim, height_in)) - self.config['num-filters-in'] = input_dim / height_in + self.config['num-filters-in'] = input_dim // height_in # Check whether 'str' is a sorted, unique, nonempty list of integers, like -1,0,1., diff --git a/egs/wsj/s5/steps/make_mfcc.sh b/egs/wsj/s5/steps/make_mfcc.sh index c88e0d65e65..8514ce4e38d 100755 --- a/egs/wsj/s5/steps/make_mfcc.sh +++ b/egs/wsj/s5/steps/make_mfcc.sh @@ -75,6 +75,8 @@ if [ -f $data/spk2warp ]; then elif [ -f $data/utt2warp ]; then echo "$0 [info]: using VTLN warp factors from $data/utt2warp" vtln_opts="--vtln-map=ark:$data/utt2warp" +else + vtln_opts="" fi for n in $(seq $nj); do diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/compute_biphone_stats.py b/egs/wsj/s5/steps/nnet3/chain/e2e/compute_biphone_stats.py new file mode 100755 index 00000000000..e009cc17a9b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/compute_biphone_stats.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Hossein Hadian +# Apache 2.0 + +import argparse +from os.path import join +import sys +import copy +import random + +parser = argparse.ArgumentParser(description="""This script reads + sequences of phone ids from std input and counts mono/biphone stats + and writes the results to std out. The output can be used with + gmm-init-biphone to create a better tree. 
The first part of the + outupt is biphone counts with this format for each line: + + and the second part of the output is monophone counts with the + following format: + """) +parser.add_argument('langdir', type=str) +parser.add_argument('--shared-phones', type=str, choices=['true','false'], + default='true', + help="If true, stats will be collected for shared phones.") + +args = parser.parse_args() +args.shared_phones = True if args.shared_phones == 'true' else False + +# Read phone sets +phone_sets = [] +phones = [] +phone_to_shard_phone = {} +phone_to_shard_phone[0] = 0 # The no-left-context case +with open(join(args.langdir, 'phones/sets.int'), 'r', encoding='latin-1') as f: + for line in f: + phone_set = line.strip().split() + phone_sets.append(phone_set) + for phone in phone_set: + phones.append(phone) + phone_to_shard_phone[phone] = phone_set[0] + +print('Loaded {} phone-sets containing {} phones.'.format(len(phone_sets), + len(phones)), + file=sys.stderr) + +biphone_counts = {} +mono_counts = {} +for line in sys.stdin: + line = line.strip().split() + key = line[0] + line_phones = line[1:] + for pair in zip([0] + line_phones, line_phones): # 0 is for the no left-context case + if args.shared_phones: + pair = (phone_to_shard_phone[pair[0]], phone_to_shard_phone[pair[1]]) + if pair not in biphone_counts: + biphone_counts[pair] = 0 + biphone_counts[pair] += 1 + mono_counts[pair[1]] = 1 if pair[1] not in mono_counts else mono_counts[pair[1]] + 1 + +for phone1 in [0] + phones: + for phone2 in phones: + pair = (phone1, phone2) + shared_pair = ((phone_to_shard_phone[pair[0]], phone_to_shard_phone[pair[1]]) + if args.shared_phones else pair) + count = biphone_counts[shared_pair] if shared_pair in biphone_counts else 0 + if count != 0: + print('{} {} {}'.format(pair[0], pair[1], count)) +for phone in phones: + shared = phone_to_shard_phone[phone] if args.shared_phones else phone + count = mono_counts[shared] if shared in mono_counts else 0 + if count != 0: + print('{} {}'.format(phone, count)) diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh b/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh index c211381bf8b..07d5ee8cfb8 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/prepare_e2e.sh @@ -14,13 +14,23 @@ cmd=run.pl nj=4 stage=0 shared_phones=true -treedir= # if specified, the tree and model will be copied from there +treedir= # If specified, the tree and model will be copied from there # note that it may not be flat start anymore. -type=mono # can be either mono or biphone -- either way +type=mono # Can be either mono or biphone -- either way # the resulting tree is full (i.e. it doesn't do any tying) -ci_silence=false # if true, silence phones will be treated as context independent +ci_silence=false # If true, silence phones will be treated as context independent scale_opts="--transition-scale=0.0 --self-loop-scale=0.0" +tie=false # If true, gmm-init-biphone will do some tying when + # creating the full biphone tree (it won't be full anymore). + # Specifically, it will revert to monophone if the data + # counts for a biphone are smaller than min_biphone_count. + # If the monophone count is also smaller than min_monophone_count, + # it will revert to a shared global phone. Note that this + # only affects biphone models (i.e., type=biphone) which + # use the special chain topology. +min_biphone_count=100 +min_monophone_count=20 # End configuration section. 
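
The count-based tying that the prepare_e2e.sh comments above describe reduces to a per-biphone decision made by gmm-init-biphone from the stats file written by compute_biphone_stats.py (whose print statements show one "left-phone right-phone count" line per seen biphone, followed by "phone count" lines). The Python sketch below merely restates that back-off rule with made-up counts; the real tying happens inside the C++ tool, not in the recipe scripts.

def back_off(biphone_count, monophone_count,
             min_biphone_count=100, min_monophone_count=20):
    if biphone_count >= min_biphone_count:
        return "keep the biphone state"
    if monophone_count >= min_monophone_count:
        return "revert to the monophone"
    return "revert to the shared global phone"

print(back_off(250, 1000))   # keep the biphone state
print(back_off(12, 1000))    # revert to the monophone
print(back_off(12, 8))       # revert to the shared global phone
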
echo "$0 $@" # Print the command line for logging @@ -35,6 +45,7 @@ if [ $# != 3 ]; then echo " --config # config containing options" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --type # context dependency type" + echo " --tie # enable/disable count-based tying" exit 1; fi @@ -69,12 +80,23 @@ if $ci_silence; then ci_opt="--ci-phones=$ciphonelist" fi +tie_opts= +if $tie && [[ "$type" = "biphone" ]]; then + cat $data/text | steps/chain/e2e/text_to_phones.py --edge-silprob 0 \ + --between-silprob 0 \ + $lang | \ + cut -d' ' -f 2- | utils/sym2int.pl $lang/phones.txt | \ + steps/chain/e2e/compute_biphone_stats.py $lang >$dir/phone-stats.txt + tie_opts="--min-biphone-count=$min_biphone_count \ +--min-monophone-count=$min_monophone_count --phone-counts=$dir/phone-stats.txt" +fi + if [ $stage -le 0 ]; then if [ -z $treedir ]; then echo "$0: Initializing $type system." # feat dim does not matter here. Just set it to 10 $cmd $dir/log/init_${type}_mdl_tree.log \ - gmm-init-$type $ci_opt $shared_phones_opt $lang/topo 10 \ + gmm-init-$type $tie_opts $ci_opt $shared_phones_opt $lang/topo 10 \ $dir/0.mdl $dir/tree || exit 1; else echo "$0: Copied tree/mdl from $treedir." >$dir/log/init_mdl_tree.log diff --git a/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py b/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py index 66ff633fbfc..edc2f7e4617 100755 --- a/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py +++ b/egs/wsj/s5/steps/nnet3/convert_nnet2_to_nnet3.py @@ -100,6 +100,7 @@ class Nnet3Model(object): def __init__(self): self.input_dim = -1 self.output_dim = -1 + self.ivector_dim = -1 self.counts = defaultdict(int) self.num_components = 0 self.components_read = 0 @@ -118,7 +119,10 @@ def add_component(self, component, pairs): Component = namedtuple("Component", "ident component pairs") if "" in pairs and self.input_dim == -1: - self.input_dim = pairs[""] + self.input_dim = int(pairs[""]) + + if "" in pairs and self.ivector_dim == -1: + self.ivector_dim = int(pairs[""]) # remove nnet2 specific tokens and catch descriptors if component == "" and "

" in pairs: @@ -159,13 +163,18 @@ def write_config(self, filename): config_string=config_string)) f.write("\n# Component nodes\n") - f.write("input-node name=input dim={0}\n".format(self.input_dim)) + if self.ivector_dim != -1: + f.write("input-node name=input dim={0}\n".format(self.input_dim-self.ivector_dim)) + f.write("input-node name=ivector dim={0}\n".format(self.ivector_dim)) + else: + f.write("input-node name=input dim={0}\n".format(self.input_dim)) previous_component = "input" for component in self.components: if component.ident == "splice": # Create splice string for the next node previous_component = make_splice_string(previous_component, - component.pairs[""]) + component.pairs[""], + component.pairs[""]) continue f.write("component-node name={name} component={name} " "input={inp}\n".format(name=component.ident, @@ -264,7 +273,7 @@ def parse_component(line, line_buffer): pairs = {} if component in SPLICE_COMPONENTS: - pairs = parse_splice_component(component, line, line_buffer) + line, pairs = parse_splice_component(component, line, line_buffer) elif component in AFFINE_COMPONENTS: pairs = parse_affine_component(component, line, line_buffer) elif component == "": @@ -335,7 +344,13 @@ def parse_splice_component(component, line, line_buffer): line = consume_token("", line) context = line.strip()[1:-1].split() - return {"" : input_dim, "" : context} + const_component_dim = 0 + line = next(line_buffer) # Context vector adds newline + line = consume_token("", line) + const_component_dim = int(line.strip().split()[0]) + + return line, {"" : input_dim, "" : context, + "" : const_component_dim} def parse_end_of_component(component, line, line_buffer): # Keeps reading until it hits the end tag for component @@ -422,7 +437,7 @@ def consume_token(token, line): return line.partition(token)[2] -def make_splice_string(nodename, context): +def make_splice_string(nodename, context, const_component_dim=0): """Generates splice string from a list of context. E.g. make_splice_string("renorm4", [-4, 4]) @@ -430,6 +445,8 @@ def make_splice_string(nodename, context): """ assert type(context) == list, "context argument must be a list" string = ["Offset({0}, {1})".format(nodename, i) for i in context] + if const_component_dim > 0: + string.append("ReplaceIndex(ivector, t, 0)") string = "Append(" + ", ".join(string) + ")" return string diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 8098b59c4ad..7853daa4563 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -471,7 +471,6 @@ if [ $stage -le 10 ] && $cleanup; then fi -exit 0 - - echo "$0: Finished decoding and preparing training examples" + +exit 0 diff --git a/egs/wsj/s5/steps/segmentation/internal/merge_targets.py b/egs/wsj/s5/steps/segmentation/internal/merge_targets.py index a14aef151c2..84b0c884f45 100755 --- a/egs/wsj/s5/steps/segmentation/internal/merge_targets.py +++ b/egs/wsj/s5/steps/segmentation/internal/merge_targets.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2017 Vimal Manohar # Apache 2.0 @@ -16,8 +16,6 @@ option. 
""" -from __future__ import print_function -from __future__ import division import argparse import logging import numpy as np @@ -111,7 +109,7 @@ def should_remove_frame(row, dim): # source[2] = [ 0 0 0 ] """ assert len(row) % dim == 0 - num_sources = len(row) / dim + num_sources = len(row) // dim max_idx = np.argmax(row) max_val = row[max_idx] diff --git a/egs/wsj/s5/steps/train_mono.sh b/egs/wsj/s5/steps/train_mono.sh index 141d128c329..5a0b79a4a1c 100755 --- a/egs/wsj/s5/steps/train_mono.sh +++ b/egs/wsj/s5/steps/train_mono.sh @@ -1,5 +1,6 @@ #!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2019 Xiaohui Zhang # Apache 2.0 @@ -13,6 +14,9 @@ cmd=run.pl scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" num_iters=40 # Number of iterations of training max_iter_inc=30 # Last iter to increase #Gauss on. +initial_beam=6 # beam used in the first iteration (set smaller to speed up initialization) +regular_beam=10 # beam used after the first iteration +retry_beam=40 totgauss=1000 # Target #Gaussians. careful=false boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment @@ -105,8 +109,7 @@ if [ $stage -le 0 ]; then rm $dir/0.*.acc fi - -beam=6 # will change to 10 below after 1st pass +beam=$initial_beam # will change to regular_beam below after 1st pass # note: using slightly wider beams for WSJ vs. RM. x=1 while [ $x -lt $num_iters ]; do @@ -116,7 +119,7 @@ while [ $x -lt $num_iters ]; do echo "$0: Aligning data" mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |" $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \ - gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] --careful=$careful "$mdl" \ + gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \ "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" \ || exit 1; fi @@ -132,7 +135,7 @@ while [ $x -lt $num_iters ]; do if [ $x -le $max_iter_inc ]; then numgauss=$[$numgauss+$incgauss]; fi - beam=10 + beam=$regular_beam x=$[$x+1] done diff --git a/egs/wsj/s5/utils/add_lex_disambig.pl b/egs/wsj/s5/utils/add_lex_disambig.pl index dd8a25de6e1..c4277e8dc06 100755 --- a/egs/wsj/s5/utils/add_lex_disambig.pl +++ b/egs/wsj/s5/utils/add_lex_disambig.pl @@ -122,6 +122,7 @@ if ($sil_probs) { shift @A; # Remove silprob shift @A; # Remove silprob + shift @A; # Remove silprob, there three numbers for sil_probs } while(@A > 0) { pop @A; # Remove last phone diff --git a/egs/wsj/s5/utils/data/get_utt2num_frames.sh b/egs/wsj/s5/utils/data/get_utt2num_frames.sh index a6d4f0ecb10..d8b006a5fc0 100755 --- a/egs/wsj/s5/utils/data/get_utt2num_frames.sh +++ b/egs/wsj/s5/utils/data/get_utt2num_frames.sh @@ -10,13 +10,14 @@ frame_shift=0.01 frame_overlap=0.015 . utils/parse_options.sh +. ./path.sh if [ $# -ne 1 ]; then echo "This script writes a file utt2num_frames with the " echo "number of frames in each utterance as measured based on the " echo "duration of the utterances (in utt2dur) and the specified " echo "frame_shift and frame_overlap." 
- echo "Usage: $0 " + echo "Usage: $0 " exit 1 fi diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh index dae440b03a3..e357ba8cbfb 100755 --- a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh +++ b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh @@ -52,15 +52,15 @@ for line in sys.stdin.readlines(): parts = line.strip().split() if line.strip()[-1] == '|': if re.search('sox --vol', ' '.join(parts[-11:])): - print 'true' + print('true') sys.exit(0) elif re.search(':[0-9]+$', line.strip()) is not None: continue else: if ' '.join(parts[1:3]) == 'sox --vol': - print 'true' + print('true') sys.exit(0) -print 'false' +print('false') "` || exit 1 if $volume_perturb_done; then diff --git a/egs/wsj/s5/utils/parallel/limit_num_gpus.sh b/egs/wsj/s5/utils/parallel/limit_num_gpus.sh index d9707a816c4..9d7caddd1f6 100755 --- a/egs/wsj/s5/utils/parallel/limit_num_gpus.sh +++ b/egs/wsj/s5/utils/parallel/limit_num_gpus.sh @@ -18,8 +18,8 @@ if [ "$1" == "--num-gpus" ]; then shift fi -if ! printf "%d" "$num_gpus" >/dev/null || [ $num_gpus -le 0 ]; then - echo $0: Must pass a positive interger after --num-gpus +if ! printf "%d" "$num_gpus" >/dev/null || [ $num_gpus -le -1 ]; then + echo $0: Must pass a positive interger or 0 after --num-gpus echo e.g. $0 --num-gpus 2 local/tfrnnlm/run_lstm.sh exit 1 fi @@ -35,18 +35,24 @@ CUDA_VISIBLE_DEVICES= num_total_gpus=`nvidia-smi -L | wc -l` num_gpus_assigned=0 -for i in `seq 0 $[$num_total_gpus-1]`; do -# going over all GPUs and check if it is idle, and add to the list if yes - if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}$i, && num_gpus_assigned=$[$num_gpus_assigned+1] - fi -# once we have enough GPUs, break out of the loop - [ $num_gpus_assigned -eq $num_gpus ] && break -done +if [ $num_gpus -eq 0 ] ; then + echo "$0: Running the job on CPU. Disabling submitting to gpu" + export CUDA_VISIBLE_DEVICES="" +else + for i in `seq 0 $[$num_total_gpus-1]`; do + # going over all GPUs and check if it is idle, and add to the list if yes + if nvidia-smi -i $i | grep "No running processes found" >/dev/null; then + CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}$i, && num_gpus_assigned=$[$num_gpus_assigned+1] + fi + # once we have enough GPUs, break out of the loop + [ $num_gpus_assigned -eq $num_gpus ] && break + done -[ $num_gpus_assigned -ne $num_gpus ] && echo Could not find enough idle GPUs && exit 1 + [ $num_gpus_assigned -ne $num_gpus ] && echo Could not find enough idle GPUs && exit 1 -export CUDA_VISIBLE_DEVICES=$(echo $CUDA_VISIBLE_DEVICES | sed "s=,$==g") + export CUDA_VISIBLE_DEVICES=$(echo $CUDA_VISIBLE_DEVICES | sed "s=,$==g") + + echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES" +fi -echo "$0: Running the job on GPU(s) $CUDA_VISIBLE_DEVICES" "$@" diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh index a50cdb04be4..99c9cbdb1f0 100755 --- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh +++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh @@ -102,6 +102,9 @@ fi if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi +if [ -f $srcdir/utt2lang ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2lang >$destdir/utt2lang +fi #prepare speed-perturbed utt2dur if [ ! 
-f $srcdir/utt2dur ]; then diff --git a/egs/yomdle_fa/v1/local/prepare_dict.sh b/egs/yomdle_fa/v1/local/prepare_dict.sh index f1b1a8d70cc..8d14130d8c0 100755 --- a/egs/yomdle_fa/v1/local/prepare_dict.sh +++ b/egs/yomdle_fa/v1/local/prepare_dict.sh @@ -18,7 +18,7 @@ mkdir -p $dir local/prepare_lexicon.py --data-dir $data_dir $dir -sed -i '/^\s*$/d' $dir/lexicon.txt +perl -i -ne 'print if /\S/' $dir/lexicon.txt cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/yomdle_russian/README.txt b/egs/yomdle_russian/README.txt new file mode 100644 index 00000000000..3bf4cc8cd2d --- /dev/null +++ b/egs/yomdle_russian/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for OCR on the Yomdle and Slam datasets. +Training is done on the Yomdle dataset and testing is done on Slam. +LM rescoring is also done with extra corpus data obtained from various sources diff --git a/egs/yomdle_russian/v1/cmd.sh b/egs/yomdle_russian/v1/cmd.sh new file mode 100755 index 00000000000..3d69546dfe8 --- /dev/null +++ b/egs/yomdle_russian/v1/cmd.sh @@ -0,0 +1,12 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export cmd="queue.pl" diff --git a/egs/yomdle_russian/v1/image b/egs/yomdle_russian/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/yomdle_russian/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/yomdle_russian/v1/local/chain/compare_wer.sh b/egs/yomdle_russian/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..80f31e0f311 --- /dev/null +++ b/egs/yomdle_russian/v1/local/chain/compare_wer.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... 
]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh b/egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..e2545b0186e --- /dev/null +++ b/egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1a.sh \ No newline at end of file diff --git a/egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh b/egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..6f5742cd34b --- /dev/null +++ b/egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# This script does end2end chain training (i.e. from scratch) +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# score_basic rescoring + nomalized +# WER 16.24 11.0 +# WER (rescored) 15.63 10.5 +# CER 5.98 5.6 +# CER (rescored) 5.66 5.3 +# Final train prob 0.1376 +# Final valid prob 0.1913 +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=27 nj=5..8 num-params=3.0M dim=40->470 combine=0.091->0.091 (over 1) logprob:train/valid[17,26,final]=(0.135,0.137,0.138/0.191,0.191,0.191) + +set -e +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights true \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 3 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 5 \ + --trainer.optimization.num-jobs-final 8 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh 
b/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..7301db33d85 --- /dev/null +++ b/egs/yomdle_russian/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +# System cnn_e2eali_1a rescoring + nomalized +# WER 12.08 7.7 +# WER (rescored) 11.90 7.5 +# CER 3.60 3.4 +# CER (rescored) 3.42 3.2 +# Final train prob -0.0373 +# Final valid prob -0.0362 +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=74 nj=3..16 num-params=6.3M dim=40->848 combine=-0.039->-0.039 (over 1) xent:train/valid[48,73,final]=(-0.206,-0.153,-0.146/-0.191,-0.156,-0.151) logprob:train/valid[48,73,final]=(-0.044,-0.038,-0.037/-0.040,-0.037,-0.036) + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=1000 +# we don't need extra left/right context for TDNN systems. +tdnn_dim=550 +# training options +srand=0 +remove_egs=false +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-dropout-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=900" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=16 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi diff --git a/egs/yomdle_russian/v1/local/check_tools.sh b/egs/yomdle_russian/v1/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/yomdle_russian/v1/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/yomdle_russian/v1/local/extract_features.sh b/egs/yomdle_russian/v1/local/extract_features.sh new file mode 100755 index 00000000000..3880ebad3e8 --- /dev/null +++ b/egs/yomdle_russian/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. 
./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/yomdle_russian/v1/local/prepare_dict.sh b/egs/yomdle_russian/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..22db5ae834d --- /dev/null +++ b/egs/yomdle_russian/v1/local/prepare_dict.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Babak Rekabdar +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +. ./utils/parse_options.sh || exit 1; + +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/yomdle_russian/v1/local/prepare_lexicon.py b/egs/yomdle_russian/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..a68b1cb49dd --- /dev/null +++ b/egs/yomdle_russian/v1/local/prepare_lexicon.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Babak Rekabdar +# 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon for BPE. It gets the set of all words that occur in data/train/text. +# Since this lexicon is based on BPE, it replaces '|' with silence. 
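For a quick sanity check of what this produces (a sketch; the token '|при' below is hypothetical, not taken from the data): each lexicon line is a BPE token followed by its characters, with the '|' word-boundary marker (prepended by utils/lang/bpe/prepend_words.py) mapped to SIL and any '#' characters dropped, so a token like '|при' would appear as '|при SIL п р и'. After local/prepare_dict.sh has run, the result can be inspected with:

    # show a few entries of the generated BPE lexicon and the derived phone set
    grep -m 5 'SIL' data/local/dict/lexicon.txt
    head -n 5 data/local/dict/nonsilence_phones.txt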
+ +import argparse +import os +import unicodedata +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + characters = " ".join([ 'SIL' if char == '|' else char for char in characters]) + characters = list(characters) + characters = "".join([ '' if char == '#' else char for char in characters]) + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/yomdle_russian/v1/local/process_corpus.py b/egs/yomdle_russian/v1/local/process_corpus.py new file mode 100755 index 00000000000..b39030270b7 --- /dev/null +++ b/egs/yomdle_russian/v1/local/process_corpus.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora +# Apache 2.0 +# This script reads valid phones and removes the lines in the corpus +# which have any other phone. + +import os +import sys, io + +phone_file = os.path.join('data/local/text/cleaned/phones.txt') +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +phone_dict = dict() +with open(phone_file, 'r', encoding='utf-8') as phone_fh: + for line in phone_fh: + line = line.strip().split()[0] + phone_dict[line] = line + +phone_dict[' '] = ' ' +corpus_text = list() +for line in infile: + text = line.strip() + skip_text = False + for phone in text: + if phone not in phone_dict.keys(): + skip_text = True + break + if not skip_text: + output.write(text+ '\n') + diff --git a/egs/yomdle_russian/v1/local/process_data.py b/egs/yomdle_russian/v1/local/process_data.py new file mode 100755 index 00000000000..d7546b0a803 --- /dev/null +++ b/egs/yomdle_russian/v1/local/process_data.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the extracted Tamil OCR (yomdle and slam) database files + and creates the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + Eg. local/process_data.py data/download/ data/local/splits/train.txt data/train + + Eg. 
text file: english_phone_books_0001_1 To sum up, then, it would appear that + utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" + +import argparse +import os +import sys +import csv +import itertools +import unicodedata +import re +import string +import unicodedata +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('data_split', type=str, help='Path to file that contain datasplits') +parser.add_argument('out_dir', type=str, help='directory to output files') +args = parser.parse_args() + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +with open(args.data_split) as f: + for line in f: + line = line.strip() + image_id = line + image_filename = image_id + '.png' + image_filepath = os.path.join(args.database_path, 'truth_line_image', image_filename) + if not os.path.isfile (image_filepath): + print("File does not exist {}".format(image_filepath)) + continue + line_id = int(line.split('_')[-1]) + csv_filename = '_'.join(line.split('_')[:-1]) + '.csv' + csv_filepath = os.path.join(args.database_path, 'truth_csv', csv_filename) + csv_file = open(csv_filepath, 'r', encoding='utf-8') + for row in csv.reader(csv_file): + if row[1] == image_filename: + text = row[11] + text_vect = text.split() # this is to avoid non-utf-8 spaces + text = " ".join(text_vect) + #text_normalized = unicodedata.normalize('NFD', text).replace('\n', '') + if not text: + continue + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + '_'.join(line.split('_')[:-1]) + '\n') + image_fh.write(image_id + ' ' + image_filepath + '\n') diff --git a/egs/yomdle_russian/v1/local/score.sh b/egs/yomdle_russian/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/yomdle_russian/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/yomdle_russian/v1/local/train_lm.sh b/egs/yomdle_russian/v1/local/train_lm.sh new file mode 100755 index 00000000000..c73c42fb7dc --- /dev/null +++ b/egs/yomdle_russian/v1/local/train_lm.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the training transcriptions and corpus text. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +order=6 +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters are for a 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.031,0.860,0.678,0.194,0.037,0.006,0.928,0.712,0.454,0.220,0.926,0.844,0.749,0.358,0.966,0.879,0.783,0.544,0.966,0.826,0.674,0.450" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure that make_lm_dir.py is called and trains only a 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/local/text/cleaned/bpe_val.txt > ${dir}/data/text/dev.txt + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + cat data/local/text/cleaned/bpe_corpus.txt > ${dir}/data/text/corpus_text.txt + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from train and corpus text + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes.
+ size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + #[perplexity = 22.0613098868] over 151116.0 words + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + #[perplexity = 23.4801171202] over 151116.0 words + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/yomdle_russian/v1/local/wer_output_filter b/egs/yomdle_russian/v1/local/wer_output_filter new file mode 100755 index 00000000000..59e364e0231 --- /dev/null +++ b/egs/yomdle_russian/v1/local/wer_output_filter @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/yomdle_russian/v1/local/yomdle b/egs/yomdle_russian/v1/local/yomdle new file mode 120000 index 00000000000..2c4544c1399 --- /dev/null +++ b/egs/yomdle_russian/v1/local/yomdle @@ -0,0 +1 @@ +../../../yomdle_tamil/v1/local/yomdle/ \ No newline at end of file diff --git a/egs/yomdle_russian/v1/path.sh b/egs/yomdle_russian/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/yomdle_russian/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/yomdle_russian/v1/run_end2end.sh b/egs/yomdle_russian/v1/run_end2end.sh new file mode 100755 index 00000000000..12beebeaa05 --- /dev/null +++ b/egs/yomdle_russian/v1/run_end2end.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2018 Hossein Hadian +# Ashish Arora +# Jonathan Chang +# Apache 2.0 + +set -e +stage=0 +nj=30 + +language_main=Russian +slam_dir=/export/corpora5/slam/SLAM/ +yomdle_dir=/export/corpora5/slam/YOMDLE/ +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/ru/ +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +./local/check_tools.sh +# Start from stage=-2 for data preparation. This stage stores line images, +# csv files and splits{train,test,train_unsup} data/download/truth_line_image, +# data/download/truth_csv and data/local/splits respectively. 
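Since the whole pipeline is driven by this one script, individual stages can be re-entered through the options defined above; the invocations below are only a sketch (the stage number, job count and corpus paths are placeholders, not values used in this setup):

    # re-run from BPE preparation onwards with fewer parallel jobs
    ./run_end2end.sh --stage 3 --nj 16
    # point the recipe at local copies of the YOMDLE/SLAM data and the LM corpus
    ./run_end2end.sh --slam_dir /path/to/SLAM/ --yomdle_dir /path/to/YOMDLE/ \
      --corpus_dir /path/to/ru_corpus/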
+if [ $stage -le -2 ]; then + echo "$0: $(date): preparing data, obtaining line images and csv files..." + local/yomdle/create_download_dir.sh --language_main $language_main \ + --slam_dir $slam_dir --yomdle_dir $yomdle_dir +fi + +if [ $stage -le -1 ]; then + echo "$0: $(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/ru.txt + head -20000 data/local/text/ru.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/ru.txt > data/local/text/cleaned/corpus.txt +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 0 ]; then + echo "$0: stage 0: Processing train and test data.$(date)" + echo "$0: creating text, images.scp, utt2spk and spk2utt" + #local/prepare_data.sh data/download/ + for set in train test; do + local/process_data.py data/download/ \ + data/local/splits/${set}.txt data/${set} + image/fix_data_dir.sh data/${set} + done +fi + +if [ $stage -le 1 ]; then + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in train test; do + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 3 ]; then + echo "$0: $(date) stage 3: BPE preparation" + # getting non-silence phones. + cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; + +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/text/cleaned/phones.txt + + cut -d' ' -f2- data/train/text > data/local/text/cleaned/train.txt + + echo "$0: learning BPE..." + # it is currently learned with only training text but we can also use all corpus text + # to learn BPE. phones are added so that one isolated occurance of every phone exists. + cat data/local/text/cleaned/phones.txt data/local/text/cleaned/train.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: $(date) stage 4: applying BPE..." + echo "$0: applying BPE on train, test text..." + for set in test train; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0: applying BPE to corpus text..." + cat data/local/text/cleaned/corpus.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_corpus.txt + cat data/local/text/cleaned/val.txt | utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | \ + sed 's/@@//g' > data/local/text/cleaned/bpe_val.txt +fi + +if [ $stage -le 5 ]; then + echo "$0: $(date) stage 5: Preparing dictionary and lang..." 
+ local/prepare_dict.sh --dir data/local/dict + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 4 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$0: $(date) stage 6: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 7 ]; then + echo "$0: $(date) stage 7: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +chunk_width='340,300,200,100' +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if [ $stage -le 8 ]; then + echo "$0: $(date) stage 8: Building a tree and training a regular chain model using the e2e alignments..." + local/chain/run_cnn_e2eali.sh --chunk_width $chunk_width +fi + +if [ $stage -le 9 ]; then + echo "$0: $(date) stage 9: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 10 ] && $decode_e2e; then + echo "$0: $(date) stage 10: decoding end2end setup..." + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 + + echo "$0: Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +fi + +if [ $stage -le 11 ] && $decode_chain; then + echo "$0: $(date) stage 11: decoding chain alignment setup..." + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + exp/chain/cnn_e2eali_1a/ exp/chain/cnn_e2eali_1a/graph || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" --beam 12 \ + exp/chain/cnn_e2eali_1a/graph data/test exp/chain/cnn_e2eali_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/cnn_e2eali_1a/decode_test{,_rescored} || exit 1 + + echo "$0: Done. Date: $(date). 
Results:" + local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +fi diff --git a/egs/yomdle_russian/v1/steps b/egs/yomdle_russian/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/yomdle_russian/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/yomdle_russian/v1/utils b/egs/yomdle_russian/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/yomdle_russian/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/yomdle_zh/v1/local/create_download.sh b/egs/yomdle_zh/v1/local/create_download.sh index a440a331747..1daad354473 100755 --- a/egs/yomdle_zh/v1/local/create_download.sh +++ b/egs/yomdle_zh/v1/local/create_download.sh @@ -43,4 +43,4 @@ local/create_line_image_from_page_image.py \ echo "Downloading table for CangJie." wget -P $download_dir/ $cangjie_url || exit 1; -sed -ie '1,8d' $download_dir/cj5-cc.txt +perl -n -i -e 'print if $. > 8' $download_dir/cj5-cc.txt diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh index d6d38f3d734..013e9a56c2f 100755 --- a/scripts/rnnlm/train_rnnlm.sh +++ b/scripts/rnnlm/train_rnnlm.sh @@ -213,7 +213,7 @@ while [ $x -lt $num_iters ]; do --read-rnnlm="$src_rnnlm" --write-rnnlm=$dir/$dest_number.raw \ --read-embedding=$dir/${embedding_type}_embedding.$x.mat \ --write-embedding=$dir/${embedding_type}_embedding.$dest_number.mat \ - "ark,bg:cat $repeated_data | rnnlm-get-egs --srand=$num_splits_processed $train_egs_args - ark:- |" || touch $dir/.train_error & + "ark,bg:cat $repeated_data | rnnlm-get-egs --chunk-length=$chunk_length --srand=$num_splits_processed $train_egs_args - ark:- |" || touch $dir/.train_error & done wait # wait for just the training jobs. [ -f $dir/.train_error ] && \ diff --git a/src/Makefile b/src/Makefile index 32301e254dd..bf500fb5d9c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -31,16 +31,9 @@ include kaldi.mk # Reset the default goal, so that the all target will become default .DEFAULT_GOAL := -all: - $(MAKE) checkversion - $(MAKE) kaldi.mk - $(MAKE) mklibdir - $(MAKE) subdirs - $(MAKE) -C matrix test +all: $(SUBDIRS) matrix/test -echo Done -subdirs: $(SUBDIRS) - mklibdir: test -d $(KALDILIBDIR) || mkdir $(KALDILIBDIR) @@ -139,11 +132,11 @@ ext_depend: check_portaudio .PHONY: $(SUBDIRS) -$(SUBDIRS) : mklibdir +$(SUBDIRS) : checkversion kaldi.mk mklibdir $(MAKE) -C $@ .PHONY: $(EXT_SUBDIRS) -$(EXT_SUBDIRS) : mklibdir ext_depend +$(EXT_SUBDIRS) : checkversion kaldi.mk mklibdir ext_depend $(MAKE) -C $@ diff --git a/src/base/io-funcs-inl.h b/src/base/io-funcs-inl.h index 6b87f4c1a24..b703ef5addc 100644 --- a/src/base/io-funcs-inl.h +++ b/src/base/io-funcs-inl.h @@ -47,7 +47,7 @@ template void WriteBasicType(std::ostream &os, os << t << " "; } if (os.fail()) { - throw std::runtime_error("Write failure in WriteBasicType."); + KALDI_ERR << "Write failure in WriteBasicType."; } } @@ -122,7 +122,7 @@ inline void WriteIntegerPairVector(std::ostream &os, bool binary, os << "]\n"; } if (os.fail()) { - throw std::runtime_error("Write failure in WriteIntegerPairVector."); + KALDI_ERR << "Write failure in WriteIntegerPairVector."; } } @@ -224,7 +224,7 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, os << "]\n"; } if (os.fail()) { - throw std::runtime_error("Write failure in WriteIntegerVector."); + KALDI_ERR << "Write failure in WriteIntegerVector."; } } diff --git a/src/base/io-funcs.cc b/src/base/io-funcs.cc index 90988faf3ea..ff9c921874e 100644 --- 
a/src/base/io-funcs.cc +++ b/src/base/io-funcs.cc @@ -138,7 +138,7 @@ void WriteToken(std::ostream &os, bool binary, const char *token) { CheckToken(token); // make sure it's valid (can be read back) os << token << " "; if (os.fail()) { - throw std::runtime_error("Write failure in WriteToken."); + KALDI_ERR << "Write failure in WriteToken."; } } diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index 0144e71f987..6396967f56b 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -46,7 +46,7 @@ namespace kaldi { We also want to have control over whitespace in text mode without affecting the meaning of the file, for pretty-printing purposes. - Errors are handled by throwing an exception (std::runtime_error). + Errors are handled by throwing a KaldiFatalError exception. For integer and floating-point types (and boolean values): diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 527de852cac..462ad956907 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -42,13 +42,12 @@ void UnitTestError() { } // end namespace kaldi. int main() { - kaldi::g_program_name = "/foo/bar/kaldi-error-test"; + kaldi::SetProgramName("/foo/bar/kaldi-error-test"); try { kaldi::UnitTestError(); KALDI_ASSERT(0); // should not happen. exit(1); - } catch(std::runtime_error &r) { - std::cout << "UnitTestError: the error we generated was: " << r.what(); + } catch(kaldi::KaldiFatalError &e) { + std::cout << "The error we generated was: '" << e.KaldiMessage() << "'\n"; } } - diff --git a/src/base/kaldi-error.cc b/src/base/kaldi-error.cc index 3eeebe01910..9705936466c 100644 --- a/src/base/kaldi-error.cc +++ b/src/base/kaldi-error.cc @@ -1,5 +1,6 @@ // base/kaldi-error.cc +// Copyright 2019 SmartAction LLC (kkm) // Copyright 2016 Brno University of Technology (author: Karel Vesely) // Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek @@ -35,88 +36,90 @@ namespace kaldi { + /***** GLOBAL VARIABLES FOR LOGGING *****/ int32 g_kaldi_verbose_level = 0; -const char *g_program_name = NULL; -static LogHandler g_log_handler = NULL; - -// If the program name was set (g_program_name != ""), GetProgramName -// returns the program name (without the path), e.g. "gmm-align". -// Otherwise it returns the empty string "". -const char *GetProgramName() { - return g_program_name == NULL ? "" : g_program_name; +static std::string program_name; +static LogHandler log_handler = NULL; + +void SetProgramName(const char *basename) { + // Using the 'static std::string' for the program name is mostly harmless, + // because (a) Kaldi logging is undefined before main(), and (b) no stdc++ + // string implementation has been found in the wild that would not be just + // an empty string when zero-initialized but not yet constructed. + program_name = basename; } + /***** HELPER FUNCTIONS *****/ -// Given a filename like "/a/b/c/d/e/f.cc", GetShortFileName -// returns "e/f.cc". Does not currently work if backslash is -// the filename separator. -static const char *GetShortFileName(const char *filename) { - const char *last_slash = strrchr(filename, '/'); - if (!last_slash) { - return filename; - } else { - while (last_slash > filename && last_slash[-1] != '/') - last_slash--; - return last_slash; +// Trim filename to at most 1 trailing directory long. Given a filename like +// "/a/b/c/d/e/f.cc", return "e/f.cc". Support both '/' and '\' as the path +// separator. 
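// For example, GetShortFileName("e/f.cc") and GetShortFileName("f.cc") return
// their argument unchanged, and a NULL path yields an empty string.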
+static const char *GetShortFileName(const char *path) { + if (path == nullptr) + return ""; + + const char *prev = path, *last = path; + while ((path = std::strpbrk(path, "\\/")) != nullptr) { + ++path; + prev = last; + last = path; } + return prev; } -/***** STACKTRACE *****/ +/***** STACK TRACE *****/ +#ifdef HAVE_EXECINFO_H static std::string Demangle(std::string trace_name) { -#if defined(HAVE_CXXABI_H) && defined(HAVE_EXECINFO_H) - // at input the string looks like: +#ifdef HAVE_CXXABI_H + // At input the string looks like: // ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d] - // We want to extract the name e.g. '_ZN5kaldi13UnitTestErrorEv", - // demangle it and return it. + // We want to extract the name e.g. '_ZN5kaldi13UnitTestErrorEv" + // and demangle it. - // try to locate '(' and '+', take the string in between, + // Try to locate '(' and '+', take the string in between. size_t begin(trace_name.find("(")), end(trace_name.rfind("+")); if (begin != std::string::npos && end != std::string::npos && begin < end) { - trace_name = trace_name.substr(begin+1,end-(begin+1)); + trace_name = trace_name.substr(begin + 1, end - (begin + 1)); } - // demangle, + // Try to demangle function name. int status; char *demangled_name = abi::__cxa_demangle(trace_name.c_str(), 0, 0, &status); - std::string ans; - if (status == 0) { - ans = demangled_name; + if (status == 0 && demangled_name != NULL) { + trace_name = demangled_name; free(demangled_name); - } else { - ans = trace_name; } - // return, - return ans; -#else +#endif // HAVE_CXXABI_H return trace_name; -#endif } - +#endif // HAVE_EXECINFO_H static std::string KaldiGetStackTrace() { std::string ans; #ifdef HAVE_EXECINFO_H -#define KALDI_MAX_TRACE_SIZE 50 -#define KALDI_MAX_TRACE_PRINT 20 // must be even. - // buffer for the trace, + const size_t KALDI_MAX_TRACE_SIZE = 50; + const size_t KALDI_MAX_TRACE_PRINT = 20; // Must be even. + // Buffer for the trace. void *trace[KALDI_MAX_TRACE_SIZE]; - // get the trace, + // Get the trace. size_t size = backtrace(trace, KALDI_MAX_TRACE_SIZE); - // get the trace symbols, + // Get the trace symbols. char **trace_symbol = backtrace_symbols(trace, size); + if (trace_symbol == NULL) + return ans; - // Compose the 'string', + // Compose a human-readable backtrace string. ans += "[ Stack-Trace: ]\n"; if (size <= KALDI_MAX_TRACE_PRINT) { for (size_t i = 0; i < size; i++) { ans += Demangle(trace_symbol[i]) + "\n"; } - } else { // print out first+last (e.g.) 5. + } else { // Print out first+last (e.g.) 5. for (size_t i = 0; i < KALDI_MAX_TRACE_PRINT/2; i++) { ans += Demangle(trace_symbol[i]) + "\n"; } @@ -125,11 +128,12 @@ static std::string KaldiGetStackTrace() { ans += Demangle(trace_symbol[i]) + "\n"; } if (size == KALDI_MAX_TRACE_SIZE) - ans += ".\n.\n.\n"; // stack was too long, probably a bug. + ans += ".\n.\n.\n"; // Stack was too long, probably a bug. } - // cleanup, - free(trace_symbol); // it's okay, just the pointers, not the strings. + // We must free the array of pointers allocated by backtrace_symbols(), + // but not the strings themselves. + free(trace_symbol); #endif // HAVE_EXECINFO_H return ans; } @@ -142,114 +146,55 @@ MessageLogger::MessageLogger(LogMessageEnvelope::Severity severity, // Obviously, we assume the strings survive the destruction of this object. envelope_.severity = severity; envelope_.func = func; - envelope_.file = GetShortFileName(file); // Pointer inside 'file'. + envelope_.file = GetShortFileName(file); // Points inside 'file'. 
envelope_.line = line; } +void MessageLogger::LogMessage() const { + // Send to the logging handler if provided. + if (log_handler != NULL) { + log_handler(envelope_, GetMessage().c_str()); + return; + } -MessageLogger::~MessageLogger() noexcept(false) { - std::string str = GetMessage(); - // print the mesage (or send to logging handler), - MessageLogger::HandleMessage(envelope_, str.c_str()); -} - -std::string MessageLogger::GetMessage() const { - // remove trailing '\n', - std::string str = ss_.str(); - while (!str.empty() && str[str.length() - 1] == '\n') - str.resize(str.length() - 1); - return str; -} - - -void MessageLogger::HandleMessage(const LogMessageEnvelope &envelope, - const char *message) { - // Send to a logging handler if provided. - if (g_log_handler != NULL) { - g_log_handler(envelope, message); + // Otherwise, use the default Kaldi logging. + // Build the log-message header. + std::stringstream full_message; + if (envelope_.severity > LogMessageEnvelope::kInfo) { + full_message << "VLOG[" << envelope_.severity << "] ("; } else { - // Otherwise, we use the default Kaldi logging. - // Build the log-message 'header', - std::stringstream header; - if (envelope.severity > LogMessageEnvelope::kInfo) { - header << "VLOG[" << envelope.severity << "] ("; - } else { - switch (envelope.severity) { - case LogMessageEnvelope::kInfo : - header << "LOG ("; - break; - case LogMessageEnvelope::kWarning : - header << "WARNING ("; - break; - case LogMessageEnvelope::kError : - header << "ERROR ("; - break; - case LogMessageEnvelope::kAssertFailed : - header << "ASSERTION_FAILED ("; - break; - default: - abort(); // coding error (unknown 'severity'), - } - } - // fill the other info from the envelope, - header << GetProgramName() << "[" KALDI_VERSION "]" << ':' - << envelope.func << "():" << envelope.file << ':' << envelope.line - << ")"; - - // Printing the message, - if (envelope.severity >= LogMessageEnvelope::kWarning) { - // VLOG, LOG, WARNING: - fprintf(stderr, "%s %s\n", header.str().c_str(), message); - } else { - // ERROR, ASSERT_FAILED (print with stack-trace): - fprintf(stderr, "%s %s\n\n%s\n", header.str().c_str(), message, - KaldiGetStackTrace().c_str()); + switch (envelope_.severity) { + case LogMessageEnvelope::kInfo : + full_message << "LOG ("; + break; + case LogMessageEnvelope::kWarning : + full_message << "WARNING ("; + break; + case LogMessageEnvelope::kAssertFailed : + full_message << "ASSERTION_FAILED ("; + break; + case LogMessageEnvelope::kError : + default: // If not the ERROR, it still an error! + full_message << "ERROR ("; + break; } } -} - -FatalMessageLogger::FatalMessageLogger(LogMessageEnvelope::Severity severity, - const char *func, const char *file, - int32 line): - MessageLogger(severity, func, file, line) { - if (severity != LogMessageEnvelope::kAssertFailed && - severity != LogMessageEnvelope::kError) { - // Don't use KALDI_ERR, since that will recursively instantiate - // MessageLogger. - throw std::runtime_error("FatalMessageLogger should be called only with " - "severities kAssertFailed and kError"); + // Add other info from the envelope and the message text. + full_message << program_name.c_str() << "[" KALDI_VERSION "]" << ':' + << envelope_.func << "():" << envelope_.file << ':' + << envelope_.line << ") " << GetMessage().c_str(); + + // Add stack trace for errors and assertion failures, if available. 
+ if (envelope_.severity < LogMessageEnvelope::kWarning) { + const std::string& stack_trace = KaldiGetStackTrace(); + if (!stack_trace.empty()) { + full_message << "\n\n" << stack_trace; + } } -} - -[[ noreturn ]] FatalMessageLogger::~FatalMessageLogger() noexcept(false) { - std::string str = GetMessage(); - - // print the mesage (or send to logging handler), - MessageLogger::HandleMessage(envelope_, str.c_str()); - // Should we throw exception, or abort? - switch (envelope_.severity) { - case LogMessageEnvelope::kAssertFailed: - abort(); // ASSERT_FAILED, - break; - case LogMessageEnvelope::kError: - if (!std::uncaught_exception()) { - // throw exception with empty message, - throw std::runtime_error(""); // KALDI_ERR, - } else { - // If we got here, this thread has already thrown exception, - // and this exception has not yet arrived to its 'catch' clause... - // Throwing a new exception would be unsafe! - // (can happen during 'stack unwinding', if we have 'KALDI_ERR << msg' - // in a destructor of some local object). - abort(); - } - break; - default: // This should never happen, based on constructor's - // preconditions. But we place abort() here so that all - // possible pathways through this function do not return. - abort(); - } + // Print the complete message to stderr. + full_message << "\n"; + std::cerr << full_message.str(); } @@ -257,17 +202,20 @@ FatalMessageLogger::FatalMessageLogger(LogMessageEnvelope::Severity severity, void KaldiAssertFailure_(const char *func, const char *file, int32 line, const char *cond_str) { - FatalMessageLogger ml(LogMessageEnvelope::kAssertFailed, func, file, line); - ml.stream() << ": '" << cond_str << "' "; + MessageLogger::Log() = + MessageLogger (LogMessageEnvelope::kAssertFailed, func, file, line) + << "Assertion failed: (" << cond_str << ")"; + fflush(NULL); // Flush all pending buffers, abort() may not flush stderr. + std::abort(); } /***** THIRD-PARTY LOG-HANDLER *****/ -LogHandler SetLogHandler(LogHandler new_handler) { - LogHandler old_handler = g_log_handler; - g_log_handler = new_handler; +LogHandler SetLogHandler(LogHandler handler) { + LogHandler old_handler = log_handler; + log_handler = handler; return old_handler; } -} // end namespace kaldi +} // namespace kaldi diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index c643902f01b..c90a18b15f1 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -1,5 +1,6 @@ // base/kaldi-error.h +// Copyright 2019 SmartAction LLC (kkm) // Copyright 2016 Brno University of Technology (author: Karel Vesely) // Copyright 2009-2011 Microsoft Corporation; Ondrej Glembek; Lukas Burget; // Saarland University @@ -42,22 +43,23 @@ namespace kaldi { /// \addtogroup error_group /// @{ -/***** VERBOSITY LEVEL *****/ +/***** PROGRAM NAME AND VERBOSITY LEVEL *****/ -/// This is set by util/parse-options.{h, cc} if you set --verbose=? option. -extern int32 g_kaldi_verbose_level; +/// Called by ParseOptions to set base name (no directory) of the executing +/// program. The name is printed in logging code along with every message, +/// because in our scripts, we often mix together the stderr of many programs. +/// This function is very thread-unsafe. +void SetProgramName(const char *basename); -/// This is set by util/parse-options.{h, cc} (from argv[0]) and used (if set) -/// in error reporting code to display the name of the program (this is because -/// in our scripts, we often mix together the stderr of many programs). 
it is -/// the base-name of the program (no directory), followed by ':' We don't use -/// std::string, due to the static initialization order fiasco. -extern const char *g_program_name; +/// This is set by util/parse-options.{h,cc} if you set --verbose=? option. +/// Do not use directly, prefer {Get,Set}VerboseLevel(). +extern int32 g_kaldi_verbose_level; +/// Get verbosity level, usually set via command line '--verbose=' switch. inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } -/// This should be rarely used; command-line programs set the verbose level -/// automatically from ParseOptions. +/// This should be rarely used, except by programs using Kaldi as library; +/// command-line programs set the verbose level automatically from ParseOptions. inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } @@ -65,76 +67,106 @@ inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } /// Log message severity and source location info. struct LogMessageEnvelope { + /// Message severity. In addition to these levels, positive values (1 to 6) + /// specify verbose logging level. Verbose messages are produced only when + /// SetVerboseLevel() has been called to set logging level to at least the + /// corresponding value. enum Severity { - kAssertFailed = -3, - kError = -2, - kWarning = -1, - kInfo = 0, + kAssertFailed = -3, //!< Assertion failure. abort() will be called. + kError = -2, //!< Fatal error. KaldiFatalError will be thrown. + kWarning = -1, //!< Indicates a recoverable but abnormal condition. + kInfo = 0, //!< Informational message. }; - // An 'enum Severity' value, or a positive number indicating verbosity level. - int severity; - const char *func; - const char *file; - int32 line; + int severity; //!< A Severity value, or positive verbosity level. + const char *func; //!< Name of the function invoking the logging. + const char *file; //!< Source file name with up to 1 leading directory. + int32 line; // + MessageLogger &operator<<(const T &val) { + ss_ << val; + return *this; + } + + // When assigned a MessageLogger, log its contents. + struct Log final { + void operator=(const MessageLogger& logger) { + logger.LogMessage(); + } + }; - /// The hook for the 'insertion operator', e.g. - /// 'KALDI_LOG << "Message,"', - inline std::ostream &stream() { return ss_; } + // When assigned a MessageLogger, log its contents and then throw + // a KaldiFatalError. + struct LogAndThrow final { + [[ noreturn ]] void operator=(const MessageLogger& logger) { + logger.LogMessage(); + throw KaldiFatalError(logger.GetMessage()); + } + }; -protected: - std::string GetMessage() const; - /// The logging function, - static void HandleMessage(const LogMessageEnvelope &env, const char *msg); +private: + std::string GetMessage() const { return ss_.str(); } + void LogMessage() const; -protected: LogMessageEnvelope envelope_; - -private: std::ostringstream ss_; }; -class FatalMessageLogger: public MessageLogger { -public: - FatalMessageLogger(LogMessageEnvelope::Severity severity, - const char *func, const char *file, int32 line); - - [[ noreturn ]] ~FatalMessageLogger() noexcept(false); -}; - -// The definition of the logging macros, +// Logging macros. 
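// Note on the idiom below: each macro expands to an assignment, and '=' binds
// more loosely than '<<', so the whole streamed message on the right is first
// accumulated in the temporary MessageLogger; the Log / LogAndThrow object on
// the left then emits it (and, for KALDI_ERR, throws KaldiFatalError) when the
// assignment runs. Call sites are unchanged, e.g. (hypothetical variables):
//   KALDI_WARN << "Could not read " << filename;
//   KALDI_ERR << "Dimension mismatch: " << dim1 << " vs. " << dim2;  // throws.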
#define KALDI_ERR \ - ::kaldi::FatalMessageLogger(::kaldi::LogMessageEnvelope::kError, \ - __func__, __FILE__, __LINE__).stream() + ::kaldi::MessageLogger::LogAndThrow() = \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kError, \ + __func__, __FILE__, __LINE__) #define KALDI_WARN \ - ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kWarning, \ - __func__, __FILE__, __LINE__).stream() + ::kaldi::MessageLogger::Log() = \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kWarning, \ + __func__, __FILE__, __LINE__) #define KALDI_LOG \ - ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kInfo, \ - __func__, __FILE__, __LINE__).stream() -#define KALDI_VLOG(v) if ((v) <= ::kaldi::g_kaldi_verbose_level) \ - ::kaldi::MessageLogger((::kaldi::LogMessageEnvelope::Severity)(v), \ - __func__, __FILE__, __LINE__).stream() + ::kaldi::MessageLogger::Log() = \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kInfo, \ + __func__, __FILE__, __LINE__) +#define KALDI_VLOG(v) \ + if ((v) <= ::kaldi::GetVerboseLevel()) \ + ::kaldi::MessageLogger::Log() = \ + ::kaldi::MessageLogger((::kaldi::LogMessageEnvelope::Severity)(v), \ + __func__, __FILE__, __LINE__) /***** KALDI ASSERTS *****/ @@ -142,17 +174,8 @@ class FatalMessageLogger: public MessageLogger { [[ noreturn ]] void KaldiAssertFailure_(const char *func, const char *file, int32 line, const char *cond_str); -// Note on KALDI_ASSERT and KALDI_PARANOID_ASSERT -// The original (simple) version of the code was this -// -// #define KALDI_ASSERT(cond) if (!(cond)) -// kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); +// Note on KALDI_ASSERT and KALDI_PARANOID_ASSERT: // -// That worked well, but we were concerned that it -// could potentially cause a performance issue due to failed branch -// prediction (best practice is to have the if branch be the commonly -// taken one). -// Therefore, we decided to move the call into the else{} branch. // A single block {} around if /else does not work, because it causes // syntax error (unmatched else block) in the following code: // @@ -161,19 +184,21 @@ class FatalMessageLogger: public MessageLogger { // else // SomethingElse(); // -// do {} while(0) -- note there is no semicolon at the end! --- works nicely +// do {} while(0) -- note there is no semicolon at the end! -- works nicely, // and compilers will be able to optimize the loop away (as the condition // is always false). +// +// Also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, and +// KALDI_ASSERT_IS_INTEGER_TYPE and KALDI_ASSERT_IS_FLOATING_TYPE, also defined +// there. #ifndef NDEBUG #define KALDI_ASSERT(cond) do { if (cond) (void)0; else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); } while(0) #else #define KALDI_ASSERT(cond) (void)0 #endif -// Also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, -// and KALDI_ASSERT_IS_INTEGER_TYPE and KALDI_ASSERT_IS_FLOATING_TYPE, -// also defined there. -// some more expensive asserts only checked if this defined + +// Some more expensive asserts only checked if this defined. #ifdef KALDI_PARANOID #define KALDI_PARANOID_ASSERT(cond) do { if (cond) (void)0; else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); } while(0) @@ -184,14 +209,15 @@ class FatalMessageLogger: public MessageLogger { /***** THIRD-PARTY LOG-HANDLER *****/ -/// Type of third-party logging function, +/// Type of third-party logging function. 
typedef void (*LogHandler)(const LogMessageEnvelope &envelope, const char *message); /// Set logging handler. If called with a non-NULL function pointer, the -/// function pointed by it is called to send messages to a caller-provided -/// log. If called with NULL pointer, restores default Kaldi error logging to -/// stderr. SetLogHandler is obviously not thread safe. +/// function pointed by it is called to send messages to a caller-provided log. +/// If called with a NULL pointer, restores default Kaldi error logging to +/// stderr. This function is obviously not thread safe; the log handler must be. +/// Returns a previously set logging handler pointer, or NULL. LogHandler SetLogHandler(LogHandler); /// @} end "addtogroup error_group" diff --git a/src/base/kaldi-math.cc b/src/base/kaldi-math.cc index 991e46a590c..17271f3c46f 100644 --- a/src/base/kaldi-math.cc +++ b/src/base/kaldi-math.cc @@ -21,6 +21,7 @@ #include "base/kaldi-math.h" #ifndef _MSC_VER #include +#include #endif #include #include @@ -42,7 +43,7 @@ int32 RoundUpToNearestPowerOfTwo(int32 n) { static std::mutex _RandMutex; int Rand(struct RandomState* state) { -#if defined(_MSC_VER) || defined(__CYGWIN__) +#if !defined(_POSIX_THREAD_SAFE_FUNCTIONS) // On Windows and Cygwin, just call Rand() return rand(); #else @@ -109,10 +110,8 @@ int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) % (unsigned int)(max_val+1-min_val)); } else { - throw std::runtime_error(std::string() - +"rand_int failed because we do not support " - +"such large random numbers. " - +"(Extend this function)."); + KALDI_ERR << "rand_int failed because we do not support such large " + "random numbers. (Extend this function)."; } } #else diff --git a/src/bin/align-text.cc b/src/bin/align-text.cc index 616dac858d7..1c695675274 100644 --- a/src/bin/align-text.cc +++ b/src/bin/align-text.cc @@ -86,28 +86,34 @@ int main(int argc, char *argv[]) { if (!text2_reader.HasKey(key)) { KALDI_WARN << "Key " << key << " is in " << text1_rspecifier - << ", but not in " << text2_rspecifier; + << ", but not in " << text2_rspecifier; n_fail++; continue; } const std::vector &text1 = text1_reader.Value(); const std::vector &text2 = text2_reader.Value(key); - // Checks if the special symbol is in the string. - KALDI_ASSERT(std::find(text1.begin(), - text1.end(), special_symbol) == text1.end()); - KALDI_ASSERT(std::find(text2.begin(), - text2.end(), special_symbol) == text2.end()); - if (std::find_if(text1.begin(), text1.end(), IsNotToken) != text1.end()) { - KALDI_ERR << "In text1, the utterance " << key << " contains unprintable characters." \ - << "That means there is a problem with the text (such as incorrect encoding)." << std::endl; - return -1; + KALDI_ERR << "In text1, the utterance " << key + << " contains unprintable characters. That means there is" + << " a problem with the text (such as incorrect encoding)."; } if (std::find_if(text2.begin(), text2.end(), IsNotToken) != text2.end()) { - KALDI_ERR << "In text2, the utterance " << key << " contains unprintable characters." \ - << "That means there is a problem with the text (such as incorrect encoding)." << std::endl; - return -1; + KALDI_ERR << "In text2, the utterance " << key + << " contains unprintable characters. That means there is" + << " a problem with the text (such as incorrect encoding)."; + } + + // Verify that the special symbol is not in the string. 
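// --------------------------------------------------------------------------
// Illustration (not part of the patch): a standalone sketch of installing a
// custom log handler through the hook declared in kaldi-error.h above.  The
// handler itself must be thread safe; the returned pointer lets the caller
// restore the previous handler afterwards.
#include <cstdio>
#include "base/kaldi-error.h"

static void QuietHandler(const kaldi::LogMessageEnvelope &envelope,
                         const char *message) {
  // Forward warnings and worse to stderr; drop info and verbose messages.
  if (envelope.severity <= kaldi::LogMessageEnvelope::kWarning)
    std::fprintf(stderr, "[kaldi] %s\n", message);
}

static void RunQuietly() {
  kaldi::LogHandler old_handler = kaldi::SetLogHandler(QuietHandler);
  // ... call into Kaldi; KALDI_LOG / KALDI_VLOG output is now suppressed ...
  kaldi::SetLogHandler(old_handler);  // restore default stderr logging
}
// --------------------------------------------------------------------------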
+ if (std::find(text1.begin(), text1.end(), special_symbol) != text1.end()){ + KALDI_ERR << "In text1, the utterance " << key + << " contains the special symbol '" << special_symbol + << "'. This is not allowed."; + } + if (std::find(text2.begin(), text2.end(), special_symbol) != text2.end()){ + KALDI_ERR << "In text2, the utterance " << key + << " contains the special symbol '" << special_symbol + << "'. This is not allowed."; } std::vector > aligned; diff --git a/src/bin/compute-wer-bootci.cc b/src/bin/compute-wer-bootci.cc index b8b0697af75..ba2a4ce739c 100644 --- a/src/bin/compute-wer-bootci.cc +++ b/src/bin/compute-wer-bootci.cc @@ -162,7 +162,7 @@ int main(int argc, char *argv[]) { try { const char *usage = - "Compute a bootstrapping of WER to extract the 95\% confidence interval.\n" + "Compute a bootstrapping of WER to extract the 95% confidence interval.\n" "Take a reference and a transcription file, in integer or text format,\n" "and outputs overall WER statistics to standard output along with its\n" "confidence interval using the bootstrap method of Bisani and Ney.\n" @@ -234,12 +234,12 @@ int main(int argc, char *argv[]) { std::cout.precision(2); std::cerr.precision(2); std::cout << "Set1: %WER " << std::fixed << 100*mean_wer << - " 95\% Conf Interval [ " << 100*mean_wer-100*interval << + " 95% Conf Interval [ " << 100*mean_wer-100*interval << ", " << 100*mean_wer+100*interval << " ]" << '\n'; if(!hyp2_rspecifier.empty()) { std::cout << "Set2: %WER " << std::fixed << 100*mean_wer2 << - " 95\% Conf Interval [ " << 100*mean_wer2-100*interval2 << + " 95% Conf Interval [ " << 100*mean_wer2-100*interval2 << ", " << 100*mean_wer2+100*interval2 << " ]" << '\n'; std::cout << "Probability of Set2 improving Set1: " << std::fixed << diff --git a/src/bin/draw-tree.cc b/src/bin/draw-tree.cc index c9be5586933..d107ab1cfac 100644 --- a/src/bin/draw-tree.cc +++ b/src/bin/draw-tree.cc @@ -34,25 +34,23 @@ void MakeEvent(std::string &qry, fst::SymbolTable *phone_syms, if (key == kPdfClass) { value = static_cast(atoi(valstr.c_str())); if (value < 0) { // not valid pdf-class - KALDI_ERR << "Bad query: invalid pdf-class (" - << valstr << ')' << std::endl << std::endl; + KALDI_ERR << "Bad query: invalid pdf-class (" << valstr << ')'; } } else { value = static_cast(phone_syms->Find(valstr.c_str())); if (value == -1) { // fst::kNoSymbol - KALDI_ERR << "Bad query: invalid symbol (" - << valstr << ')' << std::endl << std::endl; + KALDI_ERR << "Bad query: invalid symbol (" << valstr << ')'; } } query_event->push_back(std::make_pair(key++, value)); old_found = found + 1; } std::string valstr = qry.substr(old_found); - EventValueType value = static_cast(phone_syms->Find(valstr.c_str())); + EventValueType value = + static_cast(phone_syms->Find(valstr.c_str())); if (value == -1) { // fst::kNoSymbol - KALDI_ERR << "Bad query: invalid symbol (" - << valstr << ')' << std::endl << std::endl; + KALDI_ERR << "Bad query: invalid symbol (" << valstr << ')'; } query_event->push_back(std::make_pair(key, value)); diff --git a/src/configure b/src/configure index c156f253376..b21cc48f7ee 100755 --- a/src/configure +++ b/src/configure @@ -612,8 +612,8 @@ function linux_configure_redhat_fat { else cat makefiles/linux_atlas.mk >> kaldi.mk fi - echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" $use_cuda && configure_cuda + echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" } function linux_configure_atlas_static { diff --git 
a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 23b20501d4c..567cc0f6d18 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -105,8 +105,9 @@ void CuArrayBase::CopyFromVec(const std::vector &src) { if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL( - cudaMemcpy(data_, &src.front(), src.size() * sizeof(T), - cudaMemcpyHostToDevice)); + cudaMemcpyAsync(data_, &src.front(), src.size() * sizeof(T), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -122,7 +123,9 @@ void CuArray::CopyFromVec(const std::vector &src) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(this->data_, &src.front(), src.size()*sizeof(T), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(this->data_, &src.front(), + src.size()*sizeof(T), cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -179,7 +182,9 @@ void CuArrayBase::CopyToVec(std::vector *dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), this->dim_ * sizeof(T), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(&dst->front(), Data(), this->dim_ * sizeof(T), + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim); } else #endif @@ -196,7 +201,9 @@ void CuArrayBase::CopyToHost(T *dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(dst, Data(), this->dim_ * sizeof(T), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(dst, Data(), this->dim_ * sizeof(T), + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim); } else #endif @@ -212,7 +219,9 @@ void CuArrayBase::SetZero() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemset(this->data_, 0, this->dim_ * sizeof(T))); + CU_SAFE_CALL(cudaMemsetAsync(this->data_, 0, this->dim_ * sizeof(T), + cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::SetZero", tim); } else #endif diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 49c179b3673..85c2492c074 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -110,6 +110,16 @@ void CuDevice::Initialize() { // Initialize CUBLAS. CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); CUBLAS_SAFE_CALL(cublasSetStream(cublas_handle_, cudaStreamPerThread)); + + #if CUDA_VERSION >= 9000 + if (device_options_.use_tensor_cores) { + // Enable tensor cores in CUBLAS + // Note if the device does not support tensor cores this will fall back to normal math mode + CUBLAS_SAFE_CALL(cublasSetMathMode(cublas_handle_, + CUBLAS_TENSOR_OP_MATH)); + } + #endif + // Initialize the cuSPARSE library CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); CUSPARSE_SAFE_CALL(cusparseSetStream(cusparse_handle_, cudaStreamPerThread)); @@ -525,6 +535,8 @@ CuDevice::~CuDevice() { // Each thread has its own copy of the CuDevice object. // Note: this was declared "static". 
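// --------------------------------------------------------------------------
// Illustration (not part of the patch): the host-side copy pattern the
// cu-array hunks above switch to.  The transfer is issued on the per-thread
// default stream and only that stream is synchronized, so copies issued from
// other threads are not serialized against the legacy default stream.  A
// minimal sketch; the CHECK_CUDA macro is a hypothetical stand-in for
// Kaldi's CU_SAFE_CALL.
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                                  \
  do {                                                                    \
    cudaError_t err = (call);                                             \
    if (err != cudaSuccess) {                                             \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));  \
      std::exit(1);                                                       \
    }                                                                     \
  } while (0)

void CopyVectorToDevice(const std::vector<float> &src, float *device_ptr) {
  // Asynchronous with respect to other streams, but this thread still blocks
  // until its own copy finishes, matching the semantics of the old cudaMemcpy.
  CHECK_CUDA(cudaMemcpyAsync(device_ptr, src.data(),
                             src.size() * sizeof(float),
                             cudaMemcpyHostToDevice, cudaStreamPerThread));
  CHECK_CUDA(cudaStreamSynchronize(cudaStreamPerThread));
}
// --------------------------------------------------------------------------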
thread_local CuDevice CuDevice::this_thread_device_; + +CuDevice::CuDeviceOptions CuDevice::device_options_; // define and initialize the static members of the CuDevice object. int32 CuDevice::device_id_ = -1; diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index dc3df7e347d..8816f9d223b 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -184,8 +184,31 @@ class CuDevice { /// (i.e. from outside the class), call this only if Enabled() returns true. bool IsComputeExclusive(); + // Register command line options for CUDA device. + // This must be done before calling CuDevice::Initialize() + // Example: + // CuDevice::RegisterDeviceOptions(&po); + // po.Read(argc, argv); + // CuDevice::Initialize(); + static void RegisterDeviceOptions(OptionsItf *po) { + CuDevice::device_options_.Register(po); + } ~CuDevice(); private: + + struct CuDeviceOptions { + bool use_tensor_cores; // Enable tensor cores + CuDeviceOptions () : use_tensor_cores(false) {}; + void Register(OptionsItf *po) { + po->Register("cuda-use-tensor-cores", &use_tensor_cores, + "Enable FP16 tensor math. " + "This is higher performance but less accuracy. " + "This is only recommended for inference."); + } + }; + + static CuDeviceOptions device_options_; + // Default constructor used to initialize this_thread_device_ CuDevice(); CuDevice(CuDevice&); // Disallow. diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 5a5307b9f87..17d56a05772 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -28,7 +28,7 @@ #include #include #include "cudamatrix/cu-kernels-ansi.h" - +#include /*********************************************************************** @@ -958,6 +958,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, Real trans[TileDim][TileDim + 1]; Real sum[CU1DBLOCK]; } smem; + // linear thread id; const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x; const int32_cuda grid_height = gridDim.y * TileDim; @@ -1021,6 +1022,7 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, if (tid == 0) { value[blockIdx.y * gridDim.x + blockIdx.x] = smem.sum[0]; } + } // _trace_mat_mat_trans reduce the partial sum to @@ -1030,6 +1032,7 @@ __global__ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, int B_stride, Real* value) { __shared__ Real ssum[CU1DBLOCK]; + // linear thread id; const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x; const int32_cuda j = blockIdx.x * blockDim.x + threadIdx.x; @@ -1046,7 +1049,7 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, } ssum[tid] = tsum; __syncthreads(); - + // Block reduce # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { @@ -2485,6 +2488,8 @@ template __global__ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { __shared__ Real smem[CU1DBLOCK]; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; const int i = blockIdx.x; const int x_start = i * src_stride; const int y_start = i * d.stride; @@ -2496,24 +2501,9 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { for (int j = tid; j < d.cols; j += CU1DBLOCK) { tmax = fmax(tmax, x[x_start + j]); } - smem[tid] = tmax; - __syncthreads(); - - // reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - smem[tid] = 
fmax(smem[tid], smem[tid + shift]); - } - __syncthreads(); - } - - // reduce to 1 element per row - if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = fmax(smem[tid], smem[tid + shift]); - } + tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max()); + if (tid == 0) { + smem[0] = tmax; } // broadcast max to all threads @@ -2526,24 +2516,9 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { for (int j = tid; j < d.cols; j += CU1DBLOCK) { tsum += exp(x[x_start + j] - max); } - smem[tid] = tsum; - __syncthreads(); - - // reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - smem[tid] += smem[tid + shift]; - } - __syncthreads(); - } - - // reduce to 1 element per row - if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] += smem[tid + shift]; - } + tsum = BlockReduceT(temp_storage).Sum(tsum); + if (tid == 0) { + smem[0] = tsum; } // broadcast sum to all threads @@ -2577,6 +2552,8 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, const int i = blockIdx.x; const int tid = threadIdx.x; const Real* x_row = x + i * x_d.stride; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; __shared__ Real ssum[CU1DBLOCK]; // Reduce x_j^2 to CU1DBLOCK elements per row @@ -2584,34 +2561,14 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, for (int j = tid; j < x_d.cols; j += CU1DBLOCK) { tsum += x_row[j] * x_row[j]; } - ssum[tid] = tsum; + tsum = BlockReduceT(temp_storage).Sum(tsum); __syncthreads(); - - // Tree reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) - ssum[tid] += ssum[tid + shift]; - __syncthreads(); - } - - // Reduce last warp to 1 element per row. - // Threads implicitly synchronized within a warp. - if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; - } - } + const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66 - if (tid == 0) { - ssum[0] = sqrt( - fmax(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); - } + ssum[tid] = sqrt( + fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); - // Broadcast floored stddev to all threads. 
- __syncthreads(); const Real stddev_div_target_rms = ssum[0]; const Real scale = Real(1) / stddev_div_target_rms; @@ -2626,7 +2583,6 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, } } - template __global__ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, @@ -2722,6 +2678,8 @@ __global__ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, int x_stride) { __shared__ Real smem[CU1DBLOCK]; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; const int i = blockIdx.x; const int x_start = i * x_stride; const int y_start = i * y_dim.stride; @@ -2733,23 +2691,9 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { tmax = fmax(tmax, x[x_start + j]); } - smem[tid] = tmax; - __syncthreads(); - - // reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - smem[tid] = fmax(smem[tid], smem[tid + shift]); - } - __syncthreads(); - } - - // reduce to 1 element per row - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = fmax(smem[tid], smem[tid + shift]); - } + tmax = BlockReduceT(temp_storage).Reduce(tmax, cub::Max()); + if (tid == 0) { + smem[0] = tmax; } // broadcast max to all threads @@ -2762,23 +2706,9 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { tsum += exp(x[x_start + j] - max); } - smem[tid] = tsum; - __syncthreads(); - - // reduce to 2x warpSize elements per row -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - smem[tid] += smem[tid + shift]; - } - __syncthreads(); - } - - // reduce to 1 element per row - if (tid < warpSize) { - for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] += smem[tid + shift]; - } + tsum = BlockReduceT(temp_storage).Sum(tsum); + if (tid == 0) { + smem[0] = tsum; } // broadcast sum to all threads @@ -3024,6 +2954,9 @@ static void _diff_softmax(Real* x, const MatrixDim dim, const Real* value, const int value_stride, const Real* diff, const int diff_stride) { __shared__ Real ssum[CU1DBLOCK]; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; + const int tid = threadIdx.x; const int i = blockIdx.x; const int value_start = i * value_stride; @@ -3035,24 +2968,9 @@ static void _diff_softmax(Real* x, const MatrixDim dim, const Real* value, for (int j = tid; j < dim.cols; j += CU1DBLOCK) { tsum += value[value_start + j] * diff[diff_start + j]; } - ssum[tid] = tsum; - __syncthreads(); - - // Tree reduce to 2x warpSize elements. -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - ssum[tid] += ssum[tid + shift]; - } - __syncthreads(); - } - - // Warp reduce to 1 element. Threads implicitly synchronized within a warp. 
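// --------------------------------------------------------------------------
// Illustration (not part of the patch): the cub::BlockReduce pattern used by
// the rewritten kernels in this file.  One thread block handles one row; the
// block-wide aggregate is valid only in thread 0, so it is staged in shared
// memory and broadcast with __syncthreads() before every thread uses it.  A
// standalone sketch; the kernel name and launch configuration are
// hypothetical.
#include <cub/cub.cuh>

template <typename Real, int kBlockSize>
__global__ static void _demo_normalize_rows(Real *x, int num_cols,
                                            int stride) {
  typedef cub::BlockReduce<Real, kBlockSize> BlockReduceT;
  __shared__ typename BlockReduceT::TempStorage temp_storage;
  __shared__ Real row_sum;

  const int row = blockIdx.x;
  const int tid = threadIdx.x;
  Real *row_data = x + row * stride;

  // Strided per-thread partial sum over this row.
  Real tsum = Real(0);
  for (int j = tid; j < num_cols; j += kBlockSize)
    tsum += row_data[j];

  // Block-wide reduction; the result is valid only in thread 0 ...
  tsum = BlockReduceT(temp_storage).Sum(tsum);
  if (tid == 0) row_sum = tsum;
  __syncthreads();  // ... so broadcast it to all threads via shared memory.

  for (int j = tid; j < num_cols; j += kBlockSize)
    row_data[j] /= row_sum;
}
// Launch, e.g.: _demo_normalize_rows<float, 256><<<num_rows, 256>>>(...);
// --------------------------------------------------------------------------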
- if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; - } + tsum = BlockReduceT(temp_storage).Sum(tsum); + if (tid == 0) { + ssum[0] = tsum; } // Broadcast result to all threads @@ -3078,6 +2996,8 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, Real* in_deriv) { __shared__ Real ssum[CU1DBLOCK]; + typedef cub::BlockReduce BlockReduceT; + __shared__ typename BlockReduceT::TempStorage temp_storage; const int tid = threadIdx.x; const int i = blockIdx.x; const int out_value_start = i * out_value_stride; @@ -3089,24 +3009,9 @@ static void _diff_log_softmax(const MatrixDim in_deriv_dim, for (int j = tid; j < in_deriv_dim.cols; j += CU1DBLOCK) { tsum += out_deriv[out_deriv_start + j]; } - ssum[tid] = tsum; - __syncthreads(); - - // Tree reduce to 2x warpSize elements. -# pragma unroll - for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { - ssum[tid] += ssum[tid + shift]; - } - __syncthreads(); - } - - // Warp reduce to 1 element. Threads implicitly synchronized within a warp. - if (tid < warpSize) { -# pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - ssum[tid] += ssum[tid + shift]; - } + tsum = BlockReduceT(temp_storage).Sum(tsum); + if (tid == 0) { + ssum[0] = tsum; } // Broadcast result to all threads diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 247c2236565..1f09ff278ce 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -321,8 +321,10 @@ void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixIndexT dst_pitch = stride_*sizeof(Real); MatrixIndexT src_pitch = src.Stride()*sizeof(Real); MatrixIndexT width = src.NumCols()*sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, - width, src.NumRows(), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpy2DAsync(data_, dst_pitch, src.Data(), src_pitch, + width, src.NumRows(), cudaMemcpyHostToDevice, + cudaStreamPerThread)); + cudaStreamSynchronize(cudaStreamPerThread); CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)", tim); } else { diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index d4dbdf12143..7581b043ae0 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -248,7 +248,9 @@ void CuPackedMatrix::SetZero() { size_t nr = static_cast(num_rows_), num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real); - CU_SAFE_CALL(cudaMemset(reinterpret_cast(this->data_), 0, num_bytes)); + CU_SAFE_CALL(cudaMemsetAsync(reinterpret_cast(this->data_), 0, + num_bytes, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetZero", tim); } else #endif diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 536e55d8a3b..7c968c6550d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -1072,7 +1072,9 @@ void CuVectorBase::SetZero() { KALDI_ASSERT(dim_>=0); KALDI_ASSERT(data_!=NULL); CuTimer tim; - CU_SAFE_CALL(cudaMemset(data_, 0, dim_*sizeof(Real))); + CU_SAFE_CALL(cudaMemsetAsync(data_, 0, dim_*sizeof(Real), + cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuVector::SetZero", tim); } else #endif diff --git a/src/decoder/grammar-fst.cc b/src/decoder/grammar-fst.cc index 6f95993d078..ab1a8142c1d 100644 --- a/src/decoder/grammar-fst.cc +++ 
b/src/decoder/grammar-fst.cc @@ -25,10 +25,10 @@ namespace fst { GrammarFst::GrammarFst( int32 nonterm_phones_offset, - const ConstFst &top_fst, - const std::vector *> > &ifsts): + std::shared_ptr > top_fst, + const std::vector > > > &ifsts): nonterm_phones_offset_(nonterm_phones_offset), - top_fst_(&top_fst), + top_fst_(top_fst), ifsts_(ifsts) { Init(); } @@ -69,11 +69,6 @@ void GrammarFst::Destroy() { nonterminal_map_.clear(); entry_arcs_.clear(); instances_.clear(); - // the following will only do something if we read this object from disk using - // its Read() function. - for (size_t i = 0; i < fsts_to_delete_.size(); i++) - delete fsts_to_delete_[i]; - fsts_to_delete_.clear(); } @@ -127,7 +122,7 @@ void GrammarFst::InitInstances() { KALDI_ASSERT(instances_.empty()); instances_.resize(1); instances_[0].ifst_index = -1; - instances_[0].fst = top_fst_; + instances_[0].fst = top_fst_.get(); instances_[0].parent_instance = -1; instances_[0].parent_state = -1; } @@ -314,7 +309,7 @@ int32 GrammarFst::GetChildInstanceId(int32 instance_id, int32 nonterminal, } int32 ifst_index = iter->second; child_instance.ifst_index = ifst_index; - child_instance.fst = ifsts_[ifst_index].second; + child_instance.fst = ifsts_[ifst_index].second.get(); child_instance.parent_instance = instance_id; child_instance.parent_state = state; InitEntryOrReentryArcs(*(parent_instance.fst), state, @@ -429,20 +424,111 @@ void GrammarFst::Read(std::istream &is, bool binary) { "update your code."; ReadBasicType(is, binary, &num_ifsts); ReadBasicType(is, binary, &nonterm_phones_offset_); - top_fst_ = ReadConstFstFromStream(is); - fsts_to_delete_.push_back(top_fst_); + top_fst_ = std::shared_ptr >(ReadConstFstFromStream(is)); for (int32 i = 0; i < num_ifsts; i++) { int32 nonterminal; ReadBasicType(is, binary, &nonterminal); - ConstFst *this_fst = ReadConstFstFromStream(is); - fsts_to_delete_.push_back(this_fst); - ifsts_.push_back(std::pair* >(nonterminal, - this_fst)); + std::shared_ptr > + this_fst(ReadConstFstFromStream(is)); + ifsts_.push_back(std::pair > >( + nonterminal, this_fst)); } Init(); } +/** + This utility function input-determinizes a specified state s of the FST + 'fst'. (This input-determinizes while treating epsilon as a real symbol, + although for the application we expect to use it, there won't be epsilons). + + What this function does is: for any symbol i that appears as the ilabel of + more than one arc leaving state s of FST 'fst', it creates an additional + state, it creates a new state t with epsilon-input transitions leaving it for + each of those multiple arcs leaving state s; it deletes the original arcs + leaving state s; and it creates a single arc leaving state s to the newly + created state with the ilabel i on it. It sets the weights as necessary to + preserve equivalence and also to ensure that if, prior to this modification, + the FST was stochastic when cast to the log semiring (see + IsStochasticInLog()), it still will be. I.e. when interpreted as + negative logprobs, the weight from state s to t would be the sum of + the weights on the original arcs leaving state s. + + This is used as a very cheap solution when preparing FSTs for the grammar + decoder, to ensure that there is only one entry-state to the sub-FST for each + phonetic left-context; this keeps the grammar-FST code (i.e. the code that + stitches them together) simple. 
Of course it will tend to introduce + unnecessary epsilons, and if we were careful we might be able to remove + some of those, but this wouldn't have a substantial impact on overall + decoder performance so we don't bother. + */ +static void InputDeterminizeSingleState(StdArc::StateId s, + VectorFst *fst) { + bool was_input_deterministic = true; + typedef StdArc Arc; + typedef Arc::StateId StateId; + typedef Arc::Label Label; + typedef Arc::Weight Weight; + + struct InfoForIlabel { + std::vector arc_indexes; // indexes of all arcs with this ilabel + float tot_cost; // total cost of all arcs leaving state s for this + // ilabel, summed as if they were negative log-probs. + StateId new_state; // state-id of new state, if any, that we have created + // to remove duplicate symbols with this ilabel. + InfoForIlabel(): new_state(-1) { } + }; + + std::unordered_map label_map; + + size_t arc_index = 0; + for (ArcIterator > aiter(*fst, s); + !aiter.Done(); aiter.Next(), ++arc_index) { + const Arc &arc = aiter.Value(); + InfoForIlabel &info = label_map[arc.ilabel]; + if (info.arc_indexes.empty()) { + info.tot_cost = arc.weight.Value(); + } else { + info.tot_cost = -kaldi::LogAdd(-info.tot_cost, -arc.weight.Value()); + was_input_deterministic = false; + } + info.arc_indexes.push_back(arc_index); + } + + if (was_input_deterministic) + return; // Nothing to do. + + // 'new_arcs' will contain the modified list of arcs + // leaving state s + std::vector new_arcs; + new_arcs.reserve(arc_index); + arc_index = 0; + for (ArcIterator > aiter(*fst, s); + !aiter.Done(); aiter.Next(), ++arc_index) { + const Arc &arc = aiter.Value(); + Label ilabel = arc.ilabel; + InfoForIlabel &info = label_map[ilabel]; + if (info.arc_indexes.size() == 1) { + new_arcs.push_back(arc); // no changes needed + } else { + if (info.new_state < 0) { + info.new_state = fst->AddState(); + // add arc from state 's' to newly created state. + new_arcs.push_back(Arc(ilabel, 0, Weight(info.tot_cost), + info.new_state)); + } + // add arc from new state to original destination of this arc. + fst->AddArc(info.new_state, Arc(0, arc.olabel, + Weight(arc.weight.Value() - info.tot_cost), + arc.nextstate)); + } + } + fst->DeleteArcs(s); + for (size_t i = 0; i < new_arcs.size(); i++) + fst->AddArc(s, new_arcs[i]); +} + + // This class contains the implementation of the function // PrepareForGrammarFst(), which is declared in grammar-fst.h. class GrammarFstPreparer { @@ -475,6 +561,12 @@ class GrammarFstPreparer { // OK, state s is a special state. FixArcsToFinalStates(s); MaybeAddFinalProbToState(s); + // The following ensures that the start-state of sub-FSTs only has + // a single arc per left-context phone (the graph-building recipe can + // end up creating more than one if there were disambiguation symbols, + // e.g. for langauge model backoff). + if (s == fst_->Start() && IsEntryState(s)) + InputDeterminizeSingleState(s, fst_); } } } @@ -487,7 +579,7 @@ class GrammarFstPreparer { // Returns true if state 's' has at least one arc coming out of it with a // special nonterminal-related ilabel on it (i.e. an ilabel >= - // kNontermBigNumber) + // kNontermBigNumber), and false otherwise. bool IsSpecialState(StateId s) const; // This function verifies that state s does not currently have any @@ -509,6 +601,10 @@ class GrammarFstPreparer { // modify this state (by adding input-epsilon arcs), and false otherwise. 
bool NeedEpsilons(StateId s) const; + // Returns true if state s (which is expected to be the start state, although we + // don't check this) has arcs with nonterminal symbols #nonterm_begin. + bool IsEntryState(StateId s) const; + // Fixes any final-prob-related problems with this state. The problem we aim // to fix is that there may be arcs with nonterminal symbol #nonterm_end which // transition from this state to a state with non-unit final prob. This @@ -599,6 +695,24 @@ bool GrammarFstPreparer::IsSpecialState(StateId s) const { return false; } +bool GrammarFstPreparer::IsEntryState(StateId s) const { + int32 big_number = kNontermBigNumber, + encoding_multiple = GetEncodingMultiple(nonterm_phones_offset_); + + for (ArcIterator aiter(*fst_, s ); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + int32 nonterminal = (arc.ilabel - big_number) / + encoding_multiple; + // we check that at least one has label with nonterminal equal to #nonterm_begin... + // in fact they will all have this value if at least one does, and this was checked + // in NeedEpsilons(). + if (nonterminal == kNontermBegin) + return true; + } + return false; +} + + bool GrammarFstPreparer::NeedEpsilons(StateId s) const { // See the documentation for GetCategoryOfArc() for explanation of what these are. @@ -647,7 +761,7 @@ bool GrammarFstPreparer::NeedEpsilons(StateId s) const { if (nonterminal == GetPhoneSymbolFor(kNontermBegin) && s != fst_->Start()) { KALDI_ERR << "#nonterm_begin symbol is present but this is not the " - "first arc. Did you do fstdeterminizestar while compiling?"; + "first state. Did you do fstdeterminizestar while compiling?"; } if (nonterminal == GetPhoneSymbolFor(kNontermEnd)) { if (fst_->NumArcs(arc.nextstate) != 0 || diff --git a/src/decoder/grammar-fst.h b/src/decoder/grammar-fst.h index f66933c132d..cfbfcad4ec6 100644 --- a/src/decoder/grammar-fst.h +++ b/src/decoder/grammar-fst.h @@ -88,9 +88,11 @@ template<> class ArcIterator; points whenever we invoke a nonterminal. For more information see \ref grammar (i.e. ../doc/grammar.dox). - Caution: this class is not thread safe, i.e. you shouldn't access the same - GrammarFst from multiple threads. We can fix this later if needed. - */ + THREAD SAFETY: you can't use this object from multiple threads; you should + create lightweight copies of this object using the copy constructor, + e.g. `new GrammarFst(this_grammar_fst)`, if you want to decode from multiple + threads using the same GrammarFst. +*/ class GrammarFst { public: typedef GrammarFstArc Arc; @@ -136,16 +138,20 @@ class GrammarFst { phones.txt, i.e. the things with names like "#nonterm:foo" and "#nonterm:bar" in phones.txt. Also no nonterminal may appear more than once in 'fsts'. ifsts may be empty, even though that doesn't - make much sense. This function does not take ownership of - these pointers (i.e. it will not delete them when it is destroyed). + make much sense. */ GrammarFst( int32 nonterm_phones_offset, - const ConstFst &top_fst, - const std::vector *> > &ifsts); + std::shared_ptr > top_fst, + const std::vector > > > &ifsts); + + /// Copy constructor. Useful because this object is not thread safe so cannot + /// be used by multiple parallel decoder threads, but it is lightweight and + /// can copy it without causing the stored FSTs to be copied. + GrammarFst(const GrammarFst &other) = default; /// This constructor should only be used prior to calling Read(). 
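// --------------------------------------------------------------------------
// Illustration (not part of the patch): constructing a GrammarFst through the
// new shared_ptr-based interface shown above, then taking a lightweight
// per-thread copy for decoding.  A minimal sketch; the function, parameter
// names and the contact-list nonterminal are hypothetical, and the FSTs are
// assumed to have already been processed by PrepareForGrammarFst().
#include <memory>
#include <utility>
#include <vector>
#include "decoder/grammar-fst.h"

void DemoBuildGrammarFst(
    std::shared_ptr<const fst::ConstFst<fst::StdArc> > top_fst,
    std::shared_ptr<const fst::ConstFst<fst::StdArc> > contact_list_fst,
    kaldi::int32 nonterm_phones_offset,   // offset of the #nonterm symbols
    kaldi::int32 nonterm_contact_list) {  // e.g. symbol id of #nonterm:contact_list
  std::vector<std::pair<kaldi::int32,
      std::shared_ptr<const fst::ConstFst<fst::StdArc> > > > ifsts;
  ifsts.push_back(std::make_pair(nonterm_contact_list, contact_list_fst));

  fst::GrammarFst grammar_fst(nonterm_phones_offset, top_fst, ifsts);

  // GrammarFst is not thread safe, but copies are cheap and share the
  // underlying FSTs, so each decoding thread can work on its own copy:
  fst::GrammarFst per_thread_copy(grammar_fst);
  // ... hand 'per_thread_copy' to this thread's decoder ...
}
// --------------------------------------------------------------------------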
- GrammarFst(): top_fst_(NULL) { } + GrammarFst() { } // This Write function allows you to dump a GrammarFst to disk as a single // object. It only supports binary mode, but the option is allowed for @@ -229,14 +235,15 @@ class GrammarFst { an arc-index leaving a particular state in an FST (i.e. an index that we could use to Seek() to the matching arc). - @param [in] fst The FST we are looking for state-indexes for - @param [in] entry_state The state in the FST-- must have arcs with - ilabels decodable as (nonterminal_symbol, left_context_phone). - Will either be the start state (if 'nonterminal_symbol' - corresponds to #nonterm_begin), or an internal state - (if 'nonterminal_symbol' corresponds to #nonterm_reenter). - The arc-indexes of those arcs will be the values - we set in 'phone_to_arc' + @param [in] fst The FST that is being entered (or reentered) + @param [in] entry_state The state in 'fst' which is being entered + (or reentered); will be fst.Start() if it's being + entered. It must have arcs with ilabels decodable as + (nonterminal_symbol, left_context_phone). Will either be the + start state (if 'nonterminal_symbol' corresponds to + #nonterm_begin), or an internal state (if 'nonterminal_symbol' + corresponds to #nonterm_reenter). The arc-indexes of those + arcs will be the values we set in 'phone_to_arc' @param [in] nonterminal_symbol The index in phones.txt of the nonterminal symbol we expect to be encoded in the ilabels of the arcs leaving 'entry_state'. Will either correspond @@ -448,12 +455,12 @@ class GrammarFst { // The top-level FST passed in by the user; contains the start state and // final-states, and may invoke FSTs in 'ifsts_' (which can also invoke // each other recursively). - const ConstFst *top_fst_; + std::shared_ptr > top_fst_; // A list of pairs (nonterm, fst), where 'nonterm' is a user-defined // nonterminal symbol as numbered in phones.txt (e.g. #nonterm:foo), and // 'fst' is the corresponding FST. - std::vector *> > ifsts_; + std::vector > > > ifsts_; // Maps from the user-defined nonterminals like #nonterm:foo as numbered // in phones.txt, to the corresponding index into 'ifsts_', i.e. the ifst_index. @@ -473,11 +480,6 @@ class GrammarFst { // representing top_fst_, and it will be populated with more elements on // demand. An instance_id refers to an index into this vector. std::vector instances_; - - // A list of FSTs that are to be deleted when this object is destroyed. This - // will only be nonempty if we have read this object from the disk using - // Read(). - std::vector *> fsts_to_delete_; }; diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index c611ec9dc05..5f8c0778723 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -131,12 +131,12 @@ struct StdToken { // to keep it in a good numerical range). BaseFloat tot_cost; - // exta_cost is >= 0. After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). + // exta_cost is >= 0. 
After calling PruneForwardLinks, this equals the + // minimum difference between the cost of the best path that this link is a + // part of, and the cost of the absolute best path, under the assumption that + // any of the currently active states at the decoding front may eventually + // succeed (e.g. if you were to take the currently active states one by one + // and compute this difference, and then take the minimum). BaseFloat extra_cost; // 'links' is the head of singly-linked list of ForwardLinks, which is what we diff --git a/src/doc/data_prep.dox b/src/doc/data_prep.dox index d8fe1746df1..e81032537cc 100644 --- a/src/doc/data_prep.dox +++ b/src/doc/data_prep.dox @@ -191,7 +191,7 @@ the speaker identities, you can just make the speaker-ids the same as the uttera so the format of the file would be just \ \. We have made the previous sentence bold because we have encountered people creating a "global" speaker-id. This is a bad idea because it makes cepstral mean normalization -ineffective in traning (since it's applied globally), and because it will create problems +ineffective in training (since it's applied globally), and because it will create problems when you use utils/split_data_dir.sh to split your data into pieces. There is another file that exists in some setups; it is used only occasionally and diff --git a/src/doc/dependencies.dox b/src/doc/dependencies.dox index 63d2658b726..d8a5591955f 100644 --- a/src/doc/dependencies.dox +++ b/src/doc/dependencies.dox @@ -113,7 +113,7 @@ - CLAPACK, the linear algebra library (we download the headers). This is useful only on systems where you don't have ATLAS and are instead compiling with CLAPACK. - - OpenBLAS: this is an alernative to ATLAS or CLAPACK. The scripts don't + - OpenBLAS: this is an alternative to ATLAS or CLAPACK. The scripts don't use it by default but we provide installation scripts so you can install it if you want to compare it against ATLAS (it's more actively maintained than ATLAS). diff --git a/src/doc/dnn.dox b/src/doc/dnn.dox index 5b3d2b98261..bab4658e552 100644 --- a/src/doc/dnn.dox +++ b/src/doc/dnn.dox @@ -37,7 +37,7 @@ namespace kaldi { We currently have three separate codebases for deep neural nets in Kaldi. All are still active in the sense that the up-to-date recipes refer to all of them. The first one ("nnet1"( is located in code subdirectories nnet/ and - nnetbin/, and is primiarly maintained by Karel Vesely. The second is located + nnetbin/, and is primarily maintained by Karel Vesely. The second is located in code subdirectories nnet2/ and nnet2bin/, and is primarily maintained by Daniel Povey (this code was originally based on an earlier version of Karel's code, but it has been extensively rewritten). The third is located diff --git a/src/doc/grammar.dox b/src/doc/grammar.dox index d1c6f51f349..30396041d22 100644 --- a/src/doc/grammar.dox +++ b/src/doc/grammar.dox @@ -352,7 +352,7 @@ Z_S 243 The special symbols in CLG.fst will be as follows. 
The following special symbols may appear in any CLG graph, top-level or not: - - When any graph invokes a sub-graph, there will be n arc with an ilabel + - When any graph invokes a sub-graph, there will be an arc with an ilabel (\#nonterm:foo, left-context-phone) representing the user-specified nonterminal and the actual left-context, which will be followed by arcs with ilabels of the form (\#nonterm_reenter, diff --git a/src/doc/io.dox b/src/doc/io.dox index dc958f57a6f..8f3a3cc05b6 100644 --- a/src/doc/io.dox +++ b/src/doc/io.dox @@ -383,7 +383,7 @@ namespace kaldi { std::string rspecifier2 = "ark:-"; // archive read from stdin. // write to a gzipped text archive. std::string wspecifier1 = "ark,t:| gzip -c > /some/dir/foo.ark.gz"; - std::string wspecifier2 = "ark,scp:data/my.ark,data/my.ark"; + std::string wspecifier2 = "ark,scp:data/my.ark,data/my.scp"; \endcode Usually, an rspecifier or wspecifier consists of a comma-separated, unordered @@ -401,7 +401,7 @@ namespace kaldi { \endverbatim This will write an archive, and a script file with lines like "utt_id /somedir/foo.ark:1234" that specify offsets into the - archive for more efficient random access. You can then do what you like which + archive for more efficient random access. You can then do whatever you like with the script file, including breaking it up into segments, and it will behave like any other script file. Note that although the order of options before the colon doesn't generally matter, in this particular case the "ark" must come before diff --git a/src/doc/kaldi_for_dummies.dox b/src/doc/kaldi_for_dummies.dox index c04e0d0c3e9..b48d6dd8dac 100644 --- a/src/doc/kaldi_for_dummies.dox +++ b/src/doc/kaldi_for_dummies.dox @@ -71,7 +71,7 @@ and installation, - \c awk – programming language, used for searching and processing patterns in files and data streams, - \c bash – Unix shell and script programming language, - - \c grep – command-line utility for searching plain-text data sets for lines + - \c grep – command-line utility for searching plain-text datasets for lines matching a regular expression, - \c make – automatically builds executable programs and libraries from source code, @@ -87,16 +87,16 @@ If you do not have much idea about how to use GIT, please read about it: \ref tutorial_git. I installed Kaldi in this directory (called 'Kaldi root path'): -\c /home/{user}/kaldi-trunk +\c /home/{user}/kaldi \section kaldi_for_dummies_directories Kaldi directories structure Try to acknowledge where particular Kaldi components are placed. Also it would be nice if you read any \c README files you find. -\c kaldi-trunk - main Kaldi directory which contains: +\c kaldi - main Kaldi directory which contains: - \c egs – example scripts allowing you to quickly build ASR -systems for over 30 popular speech corporas (documentation is attached for each +systems for over 30 popular speech corpora (documentation is attached for each project), - \c misc – additional tools and supplies, not needed for proper Kaldi functionality, @@ -127,7 +127,7 @@ train it, test it and get some decoding results.

Your first task

Something to begin with - create a folder \c digits in -\c kaldi-trunk/egs/ directory. This is a place where you will put all +\c kaldi/egs/ directory. This is a place where you will put all the stuff related to your project. \section kaldi_for_dummies_data Data preparation @@ -136,34 +136,34 @@ the stuff related to your project. I assume that you want to set up an ASR system, basing on your own audio data. For example - let it be a set of 100 files. File format is WAV. Each file -contains 3 spoken digits recorded in english language, one by one. Each of +contains 3 spoken digits recorded in English language, one by one. Each of these audio files is named in a recognizable way (e.g. \c 1_5_6.wav, which in my pattern means that the spoken sentence is 'one, five, six') and placed in the recognizable folder representing particular speaker during a particular recording session (there may be a situation that you have recordings of the same person but in two different quality/noise environments - put these -in separate folders). So to sum up, my exemplary data set looks like this: +in separate folders). So to sum up, my exemplary dataset looks like this: - 10 different speakers (ASR systems must be trained and tested on different speakers, the more speakers you have the better), - each speaker says 10 sentences, - - 100 senteces/utterances (in 100 *.wav files placed in 10 folders related to + - 100 sentences/utterances (in 100 *.wav files placed in 10 folders related to particular speakers - 10 *.wav files in each folder), - 300 words (digits from zero to nine), - each sentence/utterance consist of 3 words. -Whatever your first data set is, adjust my example to your particular case. Be -careful with big data sets and complex grammars - start with something simple. +Whatever your first dataset is, adjust my example to your particular case. Be +careful with big datasets and complex grammars - start with something simple. Sentences that contain only digits are perfect in this case.

Task

-Go to \c kaldi-trunk/egs/digits directory and create -\c digits_audio folder. In \c kaldi-trunk/egs/digits/digits_audio +Go to \c kaldi/egs/digits directory and create +\c digits_audio folder. In \c kaldi/egs/digits/digits_audio create two folders: \c train and \c test. Select one speaker -of your choice to represent testing data set. Use this speaker's 'speakerID' as -a name for an another new folder in \c kaldi-trunk/egs/digits/digits_audio/test +of your choice to represent testing dataset. Use this speaker's 'speakerID' as +a name for an another new folder in \c kaldi/egs/digits/digits_audio/test directory. Then put there all the audio files related to that person. Put the rest (9 speakers) into \c train folder - this will be your training -data set. Also create subfolders for each speaker. +dataset. Also create subfolders for each speaker. \subsection kaldi_for_dummies_acoustic Acoustic data @@ -174,14 +174,14 @@ section as well) can be considered as a text file with some number of strings (each string in a new line). These strings need to be sorted. If you will encounter any sorting issues you can use Kaldi scripts for checking (\c utils/validate_data_dir.sh) and fixing (\c utils/fix_data_dir.sh) data order. -And for you information - \c utils directory will be attached to your project in +And for your information - \c utils directory will be attached to your project in \ref kaldi_for_dummies_tools "Tools attachment" section.

Task

-In \c kaldi-trunk/egs/digits directory, create a folder \c data. Then create +In \c kaldi/egs/digits directory, create a folder \c data. Then create \c test and \c train subfolders inside. Create in each subfolder following files (so you have files named in the same way in \c test and \c train subfolders -but they relate to two different data sets that you created before): +but they relate to two different datasets that you created before): a.) \c spk2gender
This file informs about speakers gender. As we assumed, 'speakerID' is a unique @@ -207,9 +207,9 @@ for examples below). Pattern: \verbatim -dad_4_4_2 /home/{user}/kaldi-trunk/egs/digits/digits_audio/train/dad/4_4_2.wav -july_1_2_5 /home/{user}/kaldi-trunk/egs/digits/digits_audio/train/july/1_2_5.wav -july_6_8_3 /home/{user}/kaldi-trunk/egs/digits/digits_audio/train/july/6_8_3.wav +dad_4_4_2 /home/{user}/kaldi/egs/digits/digits_audio/train/dad/4_4_2.wav +july_1_2_5 /home/{user}/kaldi/egs/digits/digits_audio/train/july/1_2_5.wav +july_6_8_3 /home/{user}/kaldi/egs/digits/digits_audio/train/july/6_8_3.wav # and so on... \endverbatim @@ -236,8 +236,8 @@ july_6_8_3 july \endverbatim e.) \c corpus.txt
-This file has a slightly different directory. In \c kaldi-trunk/egs/digits/data -create another folder \c local. In \c kaldi-trunk/egs/digits/data/local create a +This file has a slightly different directory. In \c kaldi/egs/digits/data +create another folder \c local. In \c kaldi/egs/digits/data/local create a file \c corpus.txt which should contain every single utterance transcription that can occur in your ASR system (in our case it will be 100 lines from 100 audio files). @@ -252,14 +252,14 @@ four four two \subsection kaldi_for_dummies_language Language data -This section relates to language modelling files that also need to be considered +This section relates to language modeling files that also need to be considered as 'must be done'. Look for the syntax details here: \ref data_prep (each file is precisely described). Also feel free to read some examples in other \c egs scripts. Now is the perfect time.

Task

-In \c kaldi-trunk/egs/digits/data/local directory, create a folder \c dict. In -\c kaldi-trunk/egs/digits/data/local/dict create following files: +In \c kaldi/egs/digits/data/local directory, create a folder \c dict. In +\c kaldi/egs/digits/data/local/dict create following files: a.) \c lexicon.txt
This file contains every word from your dictionary with its 'phone @@ -337,19 +337,19 @@ complete. You need to add necessary Kaldi tools that are widely used in exemplary scripts.

Task

-From \c kaldi-trunk/egs/wsj/s5 copy two folders (with the whole content) - +From \c kaldi/egs/wsj/s5 copy two folders (with the whole content) - \c utils and \c steps - and put them in your -\c kaldi-trunk/egs/digits directory. You can also create links to these +\c kaldi/egs/digits directory. You can also create links to these directories. You may find such links in, for example, -\c kaldi-trunk/egs/voxforge/s5. +\c kaldi/egs/voxforge/s5. \subsection kaldi_for_dummies_scoring Scoring script This script will help you to get decoding results.

Task

-From \c kaldi-trunk/egs/voxforge/s5/local copy the script \c score.sh into -similar location in your project (\c kaldi-trunk/egs/digits/local). +From \c kaldi/egs/voxforge/s5/local copy the script \c score.sh into +similar location in your project (\c kaldi/egs/digits/local). \subsection kaldi_for_dummies_srilm SRILM installation @@ -358,7 +358,7 @@ example - SRI Language Modeling Toolkit (SRILM).

Task

For detailed installation instructions go to -\c kaldi-trunk/tools/install_srilm.sh (read all comments inside). +\c kaldi/tools/install_srilm.sh (read all comments inside). \subsection kaldi_for_dummies_configuration Configuration files @@ -366,8 +366,8 @@ It is not necessary to create configuration files but it can be a good habit for future.

Task

-In \c kaldi-trunk/egs/digits create a folder \c conf. Inside -\c kaldi-trunk/egs/digits/conf create two files (for some configuration +In \c kaldi/egs/digits create a folder \c conf. Inside +\c kaldi/egs/digits/conf create two files (for some configuration modifications in decoding and mfcc feature extraction processes - taken from \c /egs/voxforge): @@ -395,10 +395,10 @@ decided to use two different training methods: - TRI1 - simple triphone training (first triphone pass). These two methods are enough to show noticable differences in decoding results -using only digits lexicon and small training data set. +using only digits lexicon and small training dataset.

Task

-In \c kaldi-trunk/egs/digits directory create 3 scripts: +In \c kaldi/egs/digits directory create 3 scripts: a.) \c cmd.sh
\code{.sh} @@ -416,7 +416,7 @@ export KALDI_ROOT=`pwd`/../.. export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH # Defining audio data directory (modify it for your installation directory!) -export DATA_ROOT="/home/{user}/kaldi-trunk/egs/digits/digits_audio" +export DATA_ROOT="/home/{user}/kaldi/egs/digits/digits_audio" # Enable SRILM . $KALDI_ROOT/tools/env.sh @@ -432,7 +432,7 @@ c.) \c run.sh . ./path.sh || exit 1 . ./cmd.sh || exit 1 -nj=1 # number of parallel jobs - 1 is perfect for such a small data set +nj=1 # number of parallel jobs - 1 is perfect for such a small dataset lm_order=1 # language model order (n-gram quantity) - 1 is enough for digits grammar # Safety mechanism (possible running this script with modified arguments) @@ -564,7 +564,7 @@ Now all you have to do is to run \c run.sh script. If I have made any mistakes in this tutorial, logs from the terminal should guide you how to deal with it. Besides the fact that you will notice some decoding results in the terminal -window, go to newly made \c kaldi-trunk/egs/digits/exp. You may notice there +window, go to newly made \c kaldi/egs/digits/exp. You may notice there folders with \c mono and \c tri1 results as well - directories structure are the same. Go to \c mono/decode directory. Here you may find result files (named in a wer_{number} way). Logs for decoding process may be found in \c log @@ -575,7 +575,7 @@ folder (same directory). This is just an example. The point of this short tutorial is to show you how to create 'anything' in Kaldi and to get a better understanding of how to think while using this toolkit. Personally I started with looking for tutorials made -by the Kaldi authors/developers. After succesful Kaldi installation I launched +by the Kaldi authors/developers. After successful Kaldi installation I launched some example scripts (Yesno, Voxforge, LibriSpeech - they are relatively easy and have free acoustic/language data to download - I used these three as a base for my own scripts). @@ -586,7 +586,7 @@ There are two very useful sections for beginners inside:
a.) \ref tutorial - an almost 'step by step' tutorial on how to set up an ASR system; up to some point this can be done without the RM dataset. It is worth reading, 
-b.) \ref data_prep - very detailed explaination of how to use your own data +b.) \ref data_prep - very detailed explanation of how to use your own data in Kaldi. More useful links about Kaldi I found:
diff --git a/src/doc/tutorial_looking.dox b/src/doc/tutorial_looking.dox index 420abfc9bce..831d721c7eb 100644 --- a/src/doc/tutorial_looking.dox +++ b/src/doc/tutorial_looking.dox @@ -171,7 +171,7 @@ making sure have their normal values, begin with KALDI_. This is a precaution to avoid future conflicts with other codebases (since \#defines don't limit themselves to the kaldi namespace). Notice the style of the function names: LikeThis(). Our style is generally based on - this one , + this one , to conform with OpenFst, but there are some differences. To see other elements of the style, which will help you to understand Kaldi @@ -190,7 +190,7 @@ It prints out the usage, which should give you a generic idea of how Kaldi progr are called. Note that while there is a --config option that can be used to pass a configuration file, in general Kaldi is not as config-driven as HTK and these files are not widely used. You will see a --binary option. In general, Kaldi file -formats come in both binary and test forms, and the --binary option controls how +formats come in both binary and text forms, and the --binary option controls how they are written. However, this only controls how single objects (e.g. acoustic models) are written. For whole collections of objects (e.g. collections of feature files), there is a different mechanism that we will come to later. diff --git a/src/doc/tutorial_prereqs.dox b/src/doc/tutorial_prereqs.dox index 82079a281b9..72b1fcf8ad8 100644 --- a/src/doc/tutorial_prereqs.dox +++ b/src/doc/tutorial_prereqs.dox @@ -51,7 +51,7 @@ The most difficult part of the installation process relates to the math library ATLAS; if this is not already installed as a library on your system you will have to compile it, and this requires that CPU throttling be turned off, which - may require root priveleges. We provide scripts and detailed instructions for + may require root privileges. We provide scripts and detailed instructions for all installation steps. When scripts fail, read the output carefully because it tries to provide guidance as to how to fix problems. Please inform us if there are problems at any point, however minor; see \ref other. diff --git a/src/doc/tutorial_running.dox b/src/doc/tutorial_running.dox index f977348a3cb..d639cd4e664 100644 --- a/src/doc/tutorial_running.dox +++ b/src/doc/tutorial_running.dox @@ -115,14 +115,14 @@ Now go back to the data directory and change directory to /train. Then execute t \verbatim head text -head spk2gender.map +head spk2gender head spk2utt head utt2spk head wav.scp \endverbatim - text - This file contains mappings between utterances and utterance ids which will be used by Kaldi. This file will be turned into an integer format-- still a text file, but with the words replaced with integers. -- spk2gender.map - This file contains mappings between speakers and their gender. This also acts as a list of unique users involved in training. +- spk2gender - This file contains mappings between speakers and their gender. This also acts as a list of unique users involved in training. - spk2utt - This is a mapping between the speaker identifiers and all the utterance identifiers associated with the speaker. - utt2spk - This is a one-to-one mapping between utterance ids and the corresponding speaker identifiers. - wav.scp - This file is actually read directly by Kaldi programs when doing feature extraction. Look at the file again. It is parsed as a set of key-value pairs, where the key is the first string on each line. 
The value is a kind of "extended filename", and you can guess how it works. Since it is for reading we will refer to this type of string as an "rxfilename" (for writing we use the term wxfilename). See \ref io_sec_xfilename if you are curious. Note that although we use the extension .scp, this is not a script file in the HTK sense (i.e. it is not viewed as an extension to the command-line arguments). @@ -383,7 +383,7 @@ do copy-tree --binary=false exp/mono/tree - | less \endverbatim Note that this is a monophone "tree" so it is very trivial-- it -does not have any "splits". Although this tree format was not indended to be +does not have any "splits". Although this tree format was not intended to be very human-readable, we have received a number of queries about the tree format so we will explain it. The rest of this paragraph can be skipped over by the casual reader. After "ToPdf", the tree file contains an object of the @@ -442,7 +442,7 @@ Type \verbatim grep Overall exp/mono/log/acc.{?,??}.{?,??}.log \endverbatim -You can see the acoustic likelihods on each iteration. Next look at one of the files +You can see the acoustic likelihoods on each iteration. Next look at one of the files exp/mono/log/update.*.log to see what kind of information is in the update log. When the monophone training is finished, we can test the monophone decoding. Before decoding, we have to create the decode graph. Type: @@ -505,7 +505,7 @@ gmm-decode-faster \endverbatim to see the usage message, and match up the arguments with what you see in the log file. Recall that "rspecifier" is one of those strings that specifies how to read a table, -and "wspecifier" specifies how to write one. Look carefuly at these arguments and try +and "wspecifier" specifies how to write one. Look carefully at these arguments and try to figure out what they mean. Look at the rspecifier that corresponds to the features, and try to understand it (this one has spaces inside, so Kaldi prints it out with single quotes around it so that you could paste it into the shell and the program would run as intended). diff --git a/src/doc/tutorial_setup.dox b/src/doc/tutorial_setup.dox index 11d97a945f9..13f5e3e9c74 100644 --- a/src/doc/tutorial_setup.dox +++ b/src/doc/tutorial_setup.dox @@ -34,16 +34,16 @@ Assuming Git is installed, to get the latest code you can type \verbatim - git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden + git clone https://github.com/kaldi-asr/kaldi.git \endverbatim - Then cd to kaldi-trunk. Look at the INSTALL file and follow the instructions + Then cd to kaldi. Look at the INSTALL file and follow the instructions (it points you to two subdirectories). Look carefully at the output of the installation scripts, as they try to guide you what to do. Some installation errors are non-fatal, and the installation scripts will tell you so (i.e. there are some things it installs which are nice to have but are not really needed). The "best-case" scenario is that you do: \verbatim - cd kaldi-trunk/tools/; make; cd ../src; ./configure; make + cd kaldi/tools/; make; cd ../src; ./configure; make \endverbatim and everything will just work; however, if this does not happen there are fallback plans (e.g. 
you may have to install some package on your machine, or run diff --git a/src/doc/versions.dox b/src/doc/versions.dox index b26978b6e4d..08e2c2bbda7 100644 --- a/src/doc/versions.dox +++ b/src/doc/versions.dox @@ -28,7 +28,7 @@ \section versions_scheme Versioning scheme - During its lifetime, Kaldi has has three different versioning methods. + During its lifetime, Kaldi has three different versioning methods. Originally Kaldi was a subversion (svn)-based project, and was hosted on Sourceforge. Then Kaldi was moved to github, and for some time the only version-number available was the git hash of the commit. @@ -121,7 +121,7 @@ - Create a nnet3-based setup for RNN language models (i.e. recurrent and neural net based language models) - Some extentions to the core of the nnet3 framework to support constant values and - scalar multiplication without dedicated compoennts. + scalar multiplication without dedicated components. Below are commits corresponding to minor version numbers 5.3.x. diff --git a/src/feat/feature-common-inl.h b/src/feat/feature-common-inl.h index b9c5794a629..26127a4dc4d 100644 --- a/src/feat/feature-common-inl.h +++ b/src/feat/feature-common-inl.h @@ -33,26 +33,26 @@ void OfflineFeatureTpl::ComputeFeatures( Matrix *output) { KALDI_ASSERT(output != NULL); BaseFloat new_sample_freq = computer_.GetFrameOptions().samp_freq; - if (sample_freq == new_sample_freq) + if (sample_freq == new_sample_freq) { Compute(wave, vtln_warp, output); - else { - if (new_sample_freq < sample_freq) { - if (! computer_.GetFrameOptions().allow_downsample) + } else { + if (new_sample_freq < sample_freq && + ! computer_.GetFrameOptions().allow_downsample) KALDI_ERR << "Waveform and config sample Frequency mismatch: " << sample_freq << " .vs " << new_sample_freq - << " ( use --allow_downsample=true option to allow " + << " (use --allow-downsample=true to allow " << " downsampling the waveform)."; - - // Downsample the waveform. - Vector downsampled_wave(wave); - DownsampleWaveForm(sample_freq, wave, - new_sample_freq, &downsampled_wave); - Compute(downsampled_wave, vtln_warp, output); - } else - KALDI_ERR << "New sample Frequency " << new_sample_freq - << " is larger than waveform original sampling frequency " - << sample_freq; - + else if (new_sample_freq > sample_freq && + ! computer_.GetFrameOptions().allow_upsample) + KALDI_ERR << "Waveform and config sample Frequency mismatch: " + << sample_freq << " .vs " << new_sample_freq + << " (use --allow-upsample=true option to allow " + << " upsampling the waveform)."; + // Resample the waveform. + Vector resampled_wave(wave); + ResampleWaveform(sample_freq, wave, + new_sample_freq, &resampled_wave); + Compute(resampled_wave, vtln_warp, output); } } diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h index c249414259c..c9172521d7c 100644 --- a/src/feat/feature-window.h +++ b/src/feat/feature-window.h @@ -40,14 +40,16 @@ struct FrameExtractionOptions { BaseFloat preemph_coeff; // Preemphasis coefficient. bool remove_dc_offset; // Subtract mean of wave before FFT. std::string window_type; // e.g. Hamming window - bool round_to_power_of_two; - BaseFloat blackman_coeff; - bool snip_edges; - bool allow_downsample; // May be "hamming", "rectangular", "povey", "hanning", "blackman" // "povey" is a window I made to be similar to Hamming but to go to zero at the // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) // I just don't think the Hamming window makes sense as a windowing function. 
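An aside on the feature-common-inl.h hunk above: ComputeFeatures() now checks the sample-frequency mismatch in both directions before resampling, instead of handling only the downsampling case. Below is a minimal standalone sketch of just that control flow, with plain standard-library types standing in for Kaldi's templates (the names Options, Resample and PrepareWave are hypothetical, not Kaldi APIs):

\code{.cpp}
#include <stdexcept>
#include <vector>

// Hypothetical stand-ins for the Kaldi types; only the branching mirrors the patch.
struct Options {
  bool allow_downsample = false;
  bool allow_upsample = false;
};

// Placeholder for ResampleWaveform(): a real implementation applies a
// windowed-sinc low-pass filter at 0.99 * 0.5 * min(orig_freq, new_freq).
std::vector<float> Resample(const std::vector<float> &wave,
                            float orig_freq, float new_freq) {
  (void)orig_freq; (void)new_freq;
  return wave;
}

std::vector<float> PrepareWave(const std::vector<float> &wave,
                               float wave_freq, float config_freq,
                               const Options &opts) {
  if (wave_freq == config_freq)
    return wave;  // frequencies match: no resampling needed
  if (config_freq < wave_freq && !opts.allow_downsample)
    throw std::runtime_error("sample frequency mismatch; "
                             "pass --allow-downsample=true to downsample");
  if (config_freq > wave_freq && !opts.allow_upsample)
    throw std::runtime_error("sample frequency mismatch; "
                             "pass --allow-upsample=true to upsample");
  return Resample(wave, wave_freq, config_freq);  // both directions allowed here
}
\endcode

The real code then calls Compute() on the resampled waveform, exactly as before.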
+ bool round_to_power_of_two; + BaseFloat blackman_coeff; + bool snip_edges; + bool allow_downsample; + bool allow_upsample; + int max_feature_vectors; FrameExtractionOptions(): samp_freq(16000), frame_shift_ms(10.0), @@ -59,7 +61,10 @@ struct FrameExtractionOptions { round_to_power_of_two(true), blackman_coeff(0.42), snip_edges(true), - allow_downsample(false) { } + allow_downsample(false), + allow_upsample(false), + max_feature_vectors(-1) + { } void Register(OptionsItf *opts) { opts->Register("sample-frequency", &samp_freq, @@ -90,6 +95,13 @@ struct FrameExtractionOptions { opts->Register("allow-downsample", &allow_downsample, "If true, allow the input waveform to have a higher frequency than " "the specified --sample-frequency (and we'll downsample)."); + opts->Register("max-feature-vectors", &max_feature_vectors, + "Memory optimization. If larger than 0, periodically remove feature " + "vectors so that only this number of the latest feature vectors is " + "retained."); + opts->Register("allow-upsample", &allow_upsample, + "If true, allow the input waveform to have a lower frequency than " + "the specified --sample-frequency (and we'll upsample)."); } int32 WindowShift() const { return static_cast(samp_freq * 0.001 * frame_shift_ms); diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index 5df36c8cb90..7053da54f3a 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -63,7 +63,7 @@ struct MelBanksOptions { opts->Register("low-freq", &low_freq, "Low cutoff frequency for mel bins"); opts->Register("high-freq", &high_freq, - "High cutoff frequency for mel bins (if < 0, offset from Nyquist)"); + "High cutoff frequency for mel bins (if <= 0, offset from Nyquist)"); opts->Register("vtln-low", &vtln_low, "Low inflection point in piecewise linear VTLN warping function"); opts->Register("vtln-high", &vtln_high, diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc index e3a1d5f99f3..7ba6c7c32be 100644 --- a/src/feat/online-feature-test.cc +++ b/src/feat/online-feature-test.cc @@ -375,6 +375,45 @@ void TestOnlineAppendFeature() { } } +void TestRecyclingVector() { + RecyclingVector full_vec; + RecyclingVector shrinking_vec(10); + for (int i = 0; i != 100; ++i) { + Vector data(1); + data.Set(i); + full_vec.PushBack(new Vector(data)); + shrinking_vec.PushBack(new Vector(data)); + } + KALDI_ASSERT(full_vec.Size() == 100); + KALDI_ASSERT(shrinking_vec.Size() == 100); + + // full_vec should contain everything + for (int i = 0; i != 100; ++i) { + Vector *data = full_vec.At(i); + KALDI_ASSERT(data != nullptr); + KALDI_ASSERT((*data)(0) == static_cast(i)); + } + + // shrinking_vec may throw an exception for the first 90 elements + int caught_exceptions = 0; + for (int i = 0; i != 90; ++i) { + try { + shrinking_vec.At(i); + } catch (const std::runtime_error &) { + ++caught_exceptions; + } + } + // it may actually store a bit more elements for performance efficiency considerations + KALDI_ASSERT(caught_exceptions >= 80); + + // shrinking_vec should contain the last 10 elements + for (int i = 90; i != 100; ++i) { + Vector *data = shrinking_vec.At(i); + KALDI_ASSERT(data != nullptr); + KALDI_ASSERT((*data)(0) == static_cast(i)); + } +} + } // end namespace kaldi int main() { @@ -387,6 +426,7 @@ int main() { TestOnlinePlp(); TestOnlineTransform(); TestOnlineAppendFeature(); + TestRecyclingVector(); } std::cout << "Test OK.\n"; } diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc index 88d21473b9f..90170a266e5 100644 --- 
a/src/feat/online-feature.cc +++ b/src/feat/online-feature.cc @@ -24,17 +24,53 @@ namespace kaldi { +RecyclingVector::RecyclingVector(int items_to_hold) : + items_to_hold_(items_to_hold == 0 ? -1 : items_to_hold), + first_available_index_(0) { +} + +RecyclingVector::~RecyclingVector() { + for (auto *item : items_) { + delete item; + } +} + +Vector *RecyclingVector::At(int index) const { + if (index < first_available_index_) { + KALDI_ERR << "Attempted to retrieve feature vector that was " + "already removed by the RecyclingVector (index = " << index << "; " + << "first_available_index = " << first_available_index_ << "; " + << "size = " << Size() << ")"; + } + // 'at' does size checking. + return items_.at(index - first_available_index_); +} + +void RecyclingVector::PushBack(Vector *item) { + if (items_.size() == items_to_hold_) { + delete items_.front(); + items_.pop_front(); + ++first_available_index_; + } + items_.push_back(item); +} + +int RecyclingVector::Size() const { + return first_available_index_ + items_.size(); +} + + template void OnlineGenericBaseFeature::GetFrame(int32 frame, VectorBase *feat) { - // 'at' does size checking. - feat->CopyFromVec(*(features_.at(frame))); + feat->CopyFromVec(*(features_.At(frame))); }; template OnlineGenericBaseFeature::OnlineGenericBaseFeature( const typename C::Options &opts): computer_(opts), window_function_(computer_.GetFrameOptions()), + features_(opts.frame_opts.max_feature_vectors), input_finished_(false), waveform_offset_(0) { } template @@ -63,11 +99,10 @@ template void OnlineGenericBaseFeature::ComputeFeatures() { const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions(); int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim(); - int32 num_frames_old = features_.size(), + int32 num_frames_old = features_.Size(), num_frames_new = NumFrames(num_samples_total, frame_opts, input_finished_); KALDI_ASSERT(num_frames_new >= num_frames_old); - features_.resize(num_frames_new, NULL); Vector window; bool need_raw_log_energy = computer_.NeedRawLogEnergy(); @@ -81,7 +116,7 @@ void OnlineGenericBaseFeature::ComputeFeatures() { // note: this online feature-extraction code does not support VTLN. BaseFloat vtln_warp = 1.0; computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature); - features_[frame] = this_feature; + features_.PushBack(this_feature); } // OK, we will now discard any portion of the signal that will not be // necessary to compute frames in the future. diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h index d41bb6747c7..d47a6b13e9b 100644 --- a/src/feat/online-feature.h +++ b/src/feat/online-feature.h @@ -41,6 +41,36 @@ namespace kaldi { /// @{ +/// This class serves as a storage for feature vectors with an option to limit +/// the memory usage by removing old elements. The deleted frames indices are +/// "remembered" so that regardless of the MAX_ITEMS setting, the user always +/// provides the indices as if no deletion was being performed. +/// This is useful when processing very long recordings which would otherwise +/// cause the memory to eventually blow up when the features are not being removed. +class RecyclingVector { +public: + /// By default it does not remove any elements. + RecyclingVector(int items_to_hold = -1); + + /// The ownership is being retained by this collection - do not delete the item. + Vector *At(int index) const; + + /// The ownership of the item is passed to this collection - do not delete the item. 
+ void PushBack(Vector *item); + + /// This method returns the size as if no "recycling" had happened, + /// i.e. equivalent to the number of times the PushBack method has been called. + int Size() const; + + ~RecyclingVector(); + +private: + std::deque*> items_; + int items_to_hold_; + int first_available_index_; +}; + + /// This is a templated class for online feature extraction; /// it's templated on a class like MfccComputer or PlpComputer /// that does the basic feature extraction. @@ -61,7 +91,7 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { return computer_.GetFrameOptions().frame_shift_ms / 1000.0f; } - virtual int32 NumFramesReady() const { return features_.size(); } + virtual int32 NumFramesReady() const { return features_.Size(); } virtual void GetFrame(int32 frame, VectorBase *feat); @@ -88,10 +118,6 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { ComputeFeatures(); } - ~OnlineGenericBaseFeature() { - DeletePointers(&features_); - } - private: // This function computes any additional feature frames that it is possible to // compute from 'waveform_remainder_', which at this point may contain more @@ -107,7 +133,7 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { // features_ is the Mfcc or Plp or Fbank features that we have already computed. - std::vector*> features_; + RecyclingVector features_; // True if the user has called "InputFinished()" bool input_finished_; diff --git a/src/feat/pitch-functions-test.cc b/src/feat/pitch-functions-test.cc index 098e590a8e9..0e481c18674 100644 --- a/src/feat/pitch-functions-test.cc +++ b/src/feat/pitch-functions-test.cc @@ -449,7 +449,7 @@ static void UnitTestKeeleNccfBallast() { // use pitch code with default configuration.. PitchExtractionOptions op; op.nccf_ballast = 0.05 * k; - KALDI_LOG << " nccf_ballast " << op.nccf_ballast << std::endl; + KALDI_LOG << " nccf_ballast " << op.nccf_ballast; // compute pitch. 
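The RecyclingVector introduced above keeps absolute frame indices valid even after old frames have been discarded, which is why OnlineGenericBaseFeature::GetFrame() can keep addressing frames by their original index. Below is a minimal standalone sketch of the same bookkeeping idea, using std::deque and plain float vectors rather than Kaldi's heap-allocated Vector pointers (BoundedFrames is a made-up name, not the actual class):

\code{.cpp}
#include <deque>
#include <stdexcept>
#include <vector>

// Same idea as RecyclingVector: a bounded deque plus an offset, so callers
// always index frames by their absolute frame number.
class BoundedFrames {
 public:
  explicit BoundedFrames(int max_items = -1) : max_items_(max_items) {}

  void PushBack(std::vector<float> frame) {
    if (max_items_ > 0 && static_cast<int>(frames_.size()) == max_items_) {
      frames_.pop_front();   // drop the oldest frame ...
      ++first_available_;    // ... but remember how many were dropped
    }
    frames_.push_back(std::move(frame));
  }

  const std::vector<float> &At(int index) const {
    if (index < first_available_)
      throw std::runtime_error("frame was already recycled");
    return frames_.at(index - first_available_);
  }

  int Size() const {
    return first_available_ + static_cast<int>(frames_.size());
  }

 private:
  std::deque<std::vector<float>> frames_;
  int max_items_;
  int first_available_ = 0;
};

// Usage mirrors TestRecyclingVector above: after 100 pushes with a limit of
// 10, Size() still reports 100, but only indices 90..99 are retrievable.
\endcode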
Matrix m; ComputeKaldiPitch(op, waveform, &m); @@ -493,7 +493,7 @@ static void UnitTestPitchExtractionSpeed() { double tot_time = timer.Elapsed(), speech_time = test_num * waveform.Dim() / wave.SampFreq(); KALDI_LOG << " Pitch extraction time per second of speech is " - << (tot_time / speech_time) << " seconds " << std::endl; + << (tot_time / speech_time) << " seconds."; } } static void UnitTestPitchExtractorCompareKeele() { diff --git a/src/feat/resample.cc b/src/feat/resample.cc index 518685d85c8..11f4c62bf1c 100644 --- a/src/feat/resample.cc +++ b/src/feat/resample.cc @@ -302,7 +302,7 @@ void ArbitraryResample::Resample(const VectorBase &input, VectorBase *output) const { KALDI_ASSERT(input.Dim() == num_samples_in_ && output->Dim() == weights_.size()); - + int32 output_dim = output->Dim(); for (int32 i = 0; i < output_dim; i++) { SubVector input_part(input, first_index_[i], weights_[i].Dim()); @@ -365,13 +365,13 @@ BaseFloat ArbitraryResample::FilterFunc(BaseFloat t) const { return filter * window; } -void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, - BaseFloat new_freq, Vector *new_wave) { - KALDI_ASSERT(new_freq < orig_freq); - BaseFloat lowpass_cutoff = 0.99 * 0.5 * new_freq; +void ResampleWaveform(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave) { + BaseFloat min_freq = std::min(orig_freq, new_freq); + BaseFloat lowpass_cutoff = 0.99 * 0.5 * min_freq; int32 lowpass_filter_width = 6; - LinearResample signal_downsampler(orig_freq, new_freq, - lowpass_cutoff, lowpass_filter_width); - signal_downsampler.Resample(wave, true, new_wave); + LinearResample resampler(orig_freq, new_freq, + lowpass_cutoff, lowpass_filter_width); + resampler.Resample(wave, true, new_wave); } } // namespace kaldi diff --git a/src/feat/resample.h b/src/feat/resample.h index cc3e5064863..ecac2ba7566 100644 --- a/src/feat/resample.h +++ b/src/feat/resample.h @@ -40,7 +40,7 @@ namespace kaldi { /** \file[resample.h] - + This header contains declarations of classes for resampling signals. The normal cases of resampling a signal are upsampling and downsampling (increasing and decreasing the sample rate of a signal, respectively), @@ -51,7 +51,7 @@ namespace kaldi { The input signal is always evenly spaced, say sampled with frequency S, and we assume the original signal was band-limited to S/2 or lower. The n'th input sample x_n (with n = 0, 1, ...) is interpreted as the original - signal's value at time n/S. + signal's value at time n/S. For resampling, it is convenient to view the input signal as a continuous function x(t) of t, where each sample x_n becomes a delta function @@ -73,14 +73,14 @@ namespace kaldi { means we window the sinc function out to its first zero on the left and right, w = 2 means the second zero, and so on; we normally choose w to be at least two. We call this num_zeros, not w, in the code. - + Convolving the signal x(t) with this windowed filter h(t) = f(t)g(t) and evaluating the resulting signal s(t) at an arbitrary time t is easy: we have \f[ s(t) = 1/S \sum_n x_n h(t - n/S) \f]. (note: the sign of t - n/S might be wrong, but it doesn't matter as the filter and window are symmetric). This is true for arbitrary values of t. What the class ArbitraryResample does - is to allow you to evaluate the signal for specified values of t. + is to allow you to evaluate the signal for specified values of t. */ @@ -90,7 +90,7 @@ namespace kaldi { don't have to be linearly spaced. 
The low-pass filter cutoff "filter_cutoff_hz" should be less than half the sample rate; "num_zeros" should probably be at least two preferably more; higher numbers give - sharper filters but will be less efficient. + sharper filters but will be less efficient. */ class ArbitraryResample { public: @@ -115,7 +115,7 @@ class ArbitraryResample { /// This version of the Resample function processes just /// one vector. void Resample(const VectorBase &input, - VectorBase *output) const; + VectorBase *output) const; private: void SetIndexes(const Vector &sample_points); @@ -248,20 +248,35 @@ class LinearResample { ///< previously seen input signal. }; -/// Downsample a waveform. This is a convenience wrapper for the -/// class 'LinearResample'. -/// The low-pass filter cutoff used in 'LinearResample' is 0.99 of half of the -/// new_freq and num_zeros is 6. -/// The downsampling results is also checked wit sox resampling toolkit. -/// Sox design is inspired by Laurent De Soras' paper, -/// https://ccrma.stanford.edu/~jos/resample/Implementation.html -/// It designs low pass filter using pass-band, stop-band, Nyquist freq -/// and stop-band attenuation. -/// e.g. The mainlob for Hanning window is 4pi/M, where the main-lobe width is -/// equal to (pass-band-freq - stop-band-freq). -/// Also the cutoff frequency is equal to (pass-band-freq - stop-band-freq). -void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, - BaseFloat new_freq, Vector *new_wave); +/** + Downsample or upsample a waveform. This is a convenience wrapper for the + class 'LinearResample'. + The low-pass filter cutoff used in 'LinearResample' is 0.99 of the Nyquist, + where the Nyquist is half of the minimum of (orig_freq, new_freq). The + resampling is done with a symmetric FIR filter with N_z (number of zeros) + as 6. + + We compared the downsampling results with those from the sox resampling + toolkit. + Sox's design is inspired by Laurent De Soras' paper, + https://ccrma.stanford.edu/~jos/resample/Implementation.html + + Note: we expect that while orig_freq and new_freq are of type BaseFloat, they + are actually required to have exact integer values (like 16000 or 8000) with + a ratio between them that can be expressed as a rational number with + reasonably small integer factors. +*/ +void ResampleWaveform(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave); + + +/// This function is deprecated. It is provided for backward compatibility, to avoid +/// breaking older code. 
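A small arithmetic note on the ResampleWaveform() wrapper documented above: because the low-pass cutoff is taken as 0.99 of half of min(orig_freq, new_freq) (with a filter width, num_zeros, of 6), the same cutoff applies whether you go 16 kHz to 8 kHz or 8 kHz to 16 kHz. The caller itself just does ResampleWaveform(orig_freq, wave, new_freq, &resampled), per the declaration above; here is a quick standalone check of the cutoff value:

\code{.cpp}
#include <algorithm>
#include <cstdio>

int main() {
  float orig_freq = 16000.0f, new_freq = 8000.0f;
  // Same rule as the wrapper above: 0.99 of the lower Nyquist frequency.
  float cutoff = 0.99f * 0.5f * std::min(orig_freq, new_freq);
  std::printf("low-pass cutoff for %g Hz <-> %g Hz: %g Hz\n",
              orig_freq, new_freq, cutoff);  // prints 3960
  return 0;
}
\endcode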
+inline void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase &wave, + BaseFloat new_freq, Vector *new_wave) { + ResampleWaveform(orig_freq, wave, new_freq, new_wave); +} + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/fstbin/make-grammar-fst.cc b/src/fstbin/make-grammar-fst.cc index f7fd46a4a55..fc9a17908f9 100644 --- a/src/fstbin/make-grammar-fst.cc +++ b/src/fstbin/make-grammar-fst.cc @@ -114,8 +114,9 @@ int main(int argc, char *argv[]) { std::string top_fst_str = po.GetArg(1), fst_out_str = po.GetArg(po.NumArgs()); - ConstFst *top_fst = ReadAsConstFst(top_fst_str); - std::vector* > > pairs; + std::shared_ptr > top_fst( + ReadAsConstFst(top_fst_str)); + std::vector > > > pairs; int32 num_pairs = (po.NumArgs() - 2) / 2; for (int32 i = 1; i <= num_pairs; i++) { @@ -126,12 +127,13 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Expected positive integer as nonterminal, got: " << nonterm_str; std::string fst_str = po.GetArg(2*i + 1); - ConstFst *fst = ReadAsConstFst(fst_str); - pairs.push_back(std::pair* >(nonterminal, fst)); + std::shared_ptr > this_fst(ReadAsConstFst(fst_str)); + pairs.push_back(std::pair > >( + nonterminal, this_fst)); } GrammarFst *grammar_fst = new GrammarFst(nonterm_phones_offset, - *top_fst, + top_fst, pairs); if (write_as_grammar) { @@ -151,10 +153,6 @@ int main(int argc, char *argv[]) { cfst.Write(ko.Stream(), wopts); } - delete top_fst; - for (size_t i = 0; i < pairs.size(); i++) - delete pairs[i].second; - KALDI_LOG << "Created grammar FST and wrote it to " << fst_out_str; } catch(const std::exception &e) { diff --git a/src/fstext/determinize-lattice-inl.h b/src/fstext/determinize-lattice-inl.h index 43ad809f70e..775228bfd21 100644 --- a/src/fstext/determinize-lattice-inl.h +++ b/src/fstext/determinize-lattice-inl.h @@ -510,7 +510,7 @@ template class LatticeDeterminizer { if (!CheckMemoryUsage()) return false; } return (determinized_ = true); - } catch (std::bad_alloc) { + } catch (const std::bad_alloc &) { int32 repo_size = repository_.MemSize(), arcs_size = num_arcs_ * sizeof(TempArc), elems_size = num_elems_ * sizeof(Element), @@ -520,7 +520,7 @@ template class LatticeDeterminizer { << " (repo,arcs,elems) = (" << repo_size << "," << arcs_size << "," << elems_size << ")"; return (determinized_ = false); - } catch (std::runtime_error) { + } catch (const std::runtime_error &) { KALDI_WARN << "Caught exception doing lattice determinization"; return (determinized_ = false); } diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index af4826f7bed..86bec97d4e8 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -179,8 +179,7 @@ class LatticeWeightTpl { } else if (s == "-Infinity") { f = -numeric_limits::infinity(); } else if (s == "BadNumber") { - f = numeric_limits::infinity(); - f -= f; // get NaN + f = numeric_limits::quiet_NaN(); } else { char *p; f = strtod(s.c_str(), &p); diff --git a/src/gmm/mle-diag-gmm.h b/src/gmm/mle-diag-gmm.h index 24194ef886a..d41d36489bf 100644 --- a/src/gmm/mle-diag-gmm.h +++ b/src/gmm/mle-diag-gmm.h @@ -85,7 +85,7 @@ struct MapDiagGmmOptions { /// Tau value for the weights-- this tau value is applied /// per state, not per Gaussian. 
BaseFloat weight_tau; - + MapDiagGmmOptions(): mean_tau(10.0), variance_tau(50.0), weight_tau(10.0) { } @@ -150,8 +150,8 @@ class AccumDiagGmm { const MatrixBase &data, const VectorBase &frame_weights, int32 num_threads); - - + + /// Increment the stats for this component by the specified amount /// (not all parts may be taken, depending on flags). /// Note: x_stats and x2_stats are assumed to already be multiplied by "occ" @@ -162,7 +162,7 @@ class AccumDiagGmm { /// Increment with stats from this other accumulator (times scale) void Add(double scale, const AccumDiagGmm &acc); - + /// Smooths the accumulated counts by adding 'tau' extra frames. An example /// use for this is I-smoothing for MMIE. Calls SmoothWithAccum. void SmoothStats(BaseFloat tau); @@ -179,13 +179,13 @@ class AccumDiagGmm { void SmoothWithModel(BaseFloat tau, const DiagGmm &src_gmm); // Const accessors - const GmmFlagsType Flags() const { return flags_; } + GmmFlagsType Flags() const { return flags_; } const VectorBase &occupancy() const { return occupancy_; } const MatrixBase &mean_accumulator() const { return mean_accumulator_; } const MatrixBase &variance_accumulator() const { return variance_accumulator_; } // used in testing. - void AssertEqual(const AccumDiagGmm &other); + void AssertEqual(const AccumDiagGmm &other); private: int32 dim_; int32 num_comp_; diff --git a/src/gmm/mle-full-gmm.h b/src/gmm/mle-full-gmm.h index 6e770764e1e..618714b0e9b 100644 --- a/src/gmm/mle-full-gmm.h +++ b/src/gmm/mle-full-gmm.h @@ -1,7 +1,7 @@ // gmm/mle-full-gmm.h // Copyright 2009-2011 Jan Silovsky; Saarland University; -// Microsoft Corporation; +// Microsoft Corporation; // Univ. Erlangen Nuremberg, Korbinian Riedhammer // See ../../COPYING for clarification regarding multiple authors @@ -91,7 +91,7 @@ class AccumFullGmm { void Resize(int32 num_components, int32 dim, GmmFlagsType flags); /// Calls Resize with arguments based on gmm_ptr_ void Resize(const FullGmm &gmm, GmmFlagsType flags); - + void ResizeVarAccumulator(int32 num_comp, int32 dim); /// Returns the number of mixture components int32 NumGauss() const { return num_comp_; } @@ -122,8 +122,8 @@ class AccumFullGmm { const VectorBase &data, BaseFloat frame_posterior); - /// Accessors - const GmmFlagsType Flags() const { return flags_; } + /// Accessors + GmmFlagsType Flags() const { return flags_; } const Vector &occupancy() const { return occupancy_; } const Matrix &mean_accumulator() const { return mean_accumulator_; } const std::vector > &covariance_accumulator() const { return covariance_accumulator_; } diff --git a/src/gmmbin/gmm-init-biphone.cc b/src/gmmbin/gmm-init-biphone.cc index 42a9d1a91a0..0775a5c7b23 100644 --- a/src/gmmbin/gmm-init-biphone.cc +++ b/src/gmmbin/gmm-init-biphone.cc @@ -52,12 +52,14 @@ void ReadSharedPhonesList(std::string rxfilename, std::vector EventMap *GetFullBiphoneStubMap(const std::vector > &phone_sets, const std::vector &phone2num_pdf_classes, - const std::vector &share_roots, - const std::vector &ci_phones_list) { + const std::vector &ci_phones_list, + const std::vector > &bi_counts, + int32 biphone_min_count, + const std::vector &mono_counts, + int32 mono_min_count) { { // Check the inputs - KALDI_ASSERT(!phone_sets.empty() && - share_roots.size() == phone_sets.size()); + KALDI_ASSERT(!phone_sets.empty()); std::set all_phones; for (size_t i = 0; i < phone_sets.size(); i++) { KALDI_ASSERT(IsSortedAndUniq(phone_sets[i])); @@ -81,6 +83,14 @@ EventMap level1_map[ci_phones_list[i]] = new TableEventMap(kPdfClass, level2_map); } + // If there 
is not enough data for a biphone, we will revert to monophone + // and if there is not enough data for the monophone either, we will revert + // to zerophone (which is like a global garbage pdf) after initializing it. + int32 zerophone_pdf = -1; + // If a monophone state is created for a phone-set, the corresponding pdf will + // be stored in this vector. + std::vector monophone_pdf(phone_sets.size(), -1); + for (size_t i = 0; i < phone_sets.size(); i++) { if (numpdfs_per_phone == 1) { @@ -100,38 +110,68 @@ EventMap level1_map[pset[k]] = new TableEventMap(0, level2_map); } else { KALDI_ASSERT(numpdfs_per_phone == 2); - int32 base_pdfid = current_pdfid; - std::vector pset = phone_sets[i]; // All these will have a shared + std::vector right_phoneset = phone_sets[i]; // All these will have a shared // event-map child - for (size_t k = 0; k < pset.size(); k++) { - // Create an event map for level2: - std::map level2_map; // key is 0 - { - std::map level3_map; // key is kPdfClass + // Create an event map for level2: + std::map level2_map; // key is 0 + { // Handle CI phones + std::map level3_map; // key is kPdfClass + level3_map[0] = current_pdfid++; + level3_map[1] = current_pdfid++; + level2_map[0] = new TableEventMap(kPdfClass, level3_map); // no-left-context case + for (size_t i = 0; i < ci_phones_list.size(); i++) // ci-phone left-context cases + level2_map[ci_phones_list[i]] = new TableEventMap(kPdfClass, level3_map); + } + for (size_t j = 0; j < phone_sets.size(); j++) { + std::vector left_phoneset = phone_sets[j]; // All these will have a + // shared subtree with 2 pdfids + std::map level3_map; // key is kPdfClass + if (bi_counts.empty() || + bi_counts[left_phoneset[0]][right_phoneset[0]] >= biphone_min_count) { level3_map[0] = current_pdfid++; level3_map[1] = current_pdfid++; - level2_map[0] = new TableEventMap(kPdfClass, level3_map); // no-left-context case - for (size_t i = 0; i < ci_phones_list.size(); i++) // ci-phone left-context cases - level2_map[ci_phones_list[i]] = new TableEventMap(kPdfClass, level3_map); + } else if (mono_counts.empty() || + mono_counts[right_phoneset[0]] > mono_min_count) { + // Revert to mono. 
+ KALDI_VLOG(2) << "Reverting to mono for biphone (" << left_phoneset[0] + << "," << right_phoneset[0] << ")"; + if (monophone_pdf[i] == -1) { + KALDI_VLOG(1) << "Reserving mono PDFs for phone-set " << i; + monophone_pdf[i] = current_pdfid++; + current_pdfid++; // num-pdfs-per-phone is 2 + } + level3_map[0] = monophone_pdf[i]; + level3_map[1] = monophone_pdf[i] + 1; + } else { + KALDI_VLOG(2) << "Reverting to zerophone for biphone (" + << left_phoneset[0] + << "," << right_phoneset[0] << ")"; + // Revert to zerophone + if (zerophone_pdf == -1) { + KALDI_VLOG(1) << "Reserving zero PDFs."; + zerophone_pdf = current_pdfid++; + current_pdfid++; // num-pdfs-per-phone is 2 + } + level3_map[0] = zerophone_pdf; + level3_map[1] = zerophone_pdf + 1; } - for (size_t j = 0; j < phone_sets.size(); j++) { - std::map level3_map; // key is kPdfClass - level3_map[0] = current_pdfid++; - level3_map[1] = current_pdfid++; - std::vector ipset = phone_sets[j]; // All these will have a - // shared subtree with 2 pdfids - for (size_t ik = 0; ik < ipset.size(); ik++) { - level2_map[ipset[ik]] = new TableEventMap(kPdfClass, level3_map); - } + for (size_t k = 0; k < left_phoneset.size(); k++) { + int32 left_phone = left_phoneset[k]; + level2_map[left_phone] = new TableEventMap(kPdfClass, level3_map); } - level1_map[pset[k]] = new TableEventMap(0, level2_map); - if (k != pset.size() - 1) - current_pdfid = base_pdfid; + } + for (size_t k = 0; k < right_phoneset.size(); k++) { + std::map level2_copy; + for (auto const& kv: level2_map) + level2_copy[kv.first] = kv.second->Copy(std::vector()); + int32 right_phone = right_phoneset[k]; + level1_map[right_phone] = new TableEventMap(0, level2_copy); } } } + KALDI_LOG << "Num PDFs: " << current_pdfid; return new TableEventMap(1, level1_map); } @@ -139,7 +179,11 @@ EventMap ContextDependency* BiphoneContextDependencyFull(std::vector > phone_sets, const std::vector phone2num_pdf_classes, - const std::vector &ci_phones_list) { + const std::vector &ci_phones_list, + const std::vector > &bi_counts, + int32 biphone_min_count, + const std::vector &mono_counts, + int32 mono_min_count) { // Remove all the CI phones from the phone sets std::set ci_phones; for (size_t i = 0; i < ci_phones_list.size(); i++) @@ -159,13 +203,54 @@ BiphoneContextDependencyFull(std::vector > phone_sets, int32 P = 1, N = 2; EventMap *pdf_map = GetFullBiphoneStubMap(phone_sets, phone2num_pdf_classes, - share_roots, ci_phones_list); + ci_phones_list, bi_counts, + biphone_min_count, mono_counts, + mono_min_count); return new ContextDependency(N, P, pdf_map); } } // end namespace kaldi +/* This function reads the counts of biphones and monophones from a text file + generated for chain flat-start training. On each line there is either a + biphone count or a monophone count: + + + The phone-id's are according to phones.txt. + + It's more efficient to load the biphone counts into a map because + most entries are zero, but since there are not many biphones, a 2-dim vector + is OK. */ +static void ReadPhoneCounts(std::string &filename, int32 num_phones, + std::vector *mono_counts, + std::vector > *bi_counts) { + // The actual phones start from id = 1 (so the last phone has id = num_phones). 
+ mono_counts->resize(num_phones + 1, 0); + bi_counts->resize(num_phones + 1, std::vector(num_phones + 1, 0)); + std::ifstream infile(filename); + std::string line; + while (std::getline(infile, line)) { + std::istringstream iss(line); + int a, b; + long c; + if ((std::istringstream(line) >> a >> b >> c)) { + // It's a biphone count. + KALDI_ASSERT(a >= 0 && a <= num_phones); // 0 means no-left-context + KALDI_ASSERT(b > 0 && b <= num_phones); + KALDI_ASSERT(c >= 0); + (*bi_counts)[a][b] = c; + } else if ((std::istringstream(line) >> b >> c)) { + // It's a monophone count. + KALDI_ASSERT(b > 0 && b <= num_phones); + KALDI_ASSERT(c >= 0); + (*mono_counts)[b] = c; + } else { + KALDI_ERR << "Bad line in phone stats file: " << line; + } + } +} + int main(int argc, char *argv[]) { try { using namespace kaldi; @@ -179,7 +264,8 @@ int main(int argc, char *argv[]) { " gmm-init-biphone topo 39 bi.mdl bi.tree\n"; bool binary = true; - std::string shared_phones_rxfilename; + std::string shared_phones_rxfilename, phone_counts_rxfilename; + int32 min_biphone_count = 100, min_mono_count = 20; std::string ci_phones_str; std::vector ci_phones; // Sorted, uniqe vector of // context-independent phones. @@ -191,6 +277,15 @@ int main(int argc, char *argv[]) { "whose pdfs should be shared."); po.Register("ci-phones", &ci_phones_str, "Colon-separated list of " "integer indices of context-independent phones."); + po.Register("phone-counts", &phone_counts_rxfilename, + "rxfilename containing, on each line, a biphone/phone and " + "its count in the training data."); + po.Register("min-biphone-count", &min_biphone_count, "Minimum number of " + "occurences of a biphone in training data to reserve pdfs " + "for it."); + po.Register("min-monophone-count", &min_mono_count, "Minimum number of " + "occurences of a monophone in training data to reserve pdfs " + "for it."); po.Read(argc, argv); if (po.NumArgs() != 4) { @@ -214,7 +309,6 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Invalid --ci-phones option: " << ci_phones_str; } - Vector glob_inv_var(dim); glob_inv_var.Set(1.0); Vector glob_mean(dim); @@ -235,6 +329,15 @@ int main(int argc, char *argv[]) { phone2num_pdf_classes[phones[i]] == 2); } + std::vector mono_counts; + std::vector > bi_counts; + if (!phone_counts_rxfilename.empty()) { + ReadPhoneCounts(phone_counts_rxfilename, phones.size(), + &mono_counts, &bi_counts); + KALDI_LOG << "Loaded mono/bi phone counts."; + } + + // Now the tree: ContextDependency *ctx_dep = NULL; std::vector > shared_phones; @@ -247,7 +350,9 @@ int main(int argc, char *argv[]) { // ReadSharedPhonesList crashes on error. } ctx_dep = BiphoneContextDependencyFull(shared_phones, phone2num_pdf_classes, - ci_phones); + ci_phones, bi_counts, + min_biphone_count, + mono_counts, min_mono_count); int32 num_pdfs = ctx_dep->NumPdfs(); diff --git a/src/ivector/logistic-regression.cc b/src/ivector/logistic-regression.cc index 5d02c013294..4eae2ebe3d7 100644 --- a/src/ivector/logistic-regression.cc +++ b/src/ivector/logistic-regression.cc @@ -55,8 +55,7 @@ void LogisticRegression::Train(const Matrix &xs, weights_.SetZero(); TrainParameters(xs_with_prior, ys, conf, &xw); - KALDI_LOG << - "Finished training parameters without mixture components." << std::endl; + KALDI_LOG << "Finished training parameters without mixture components."; // If we are using mixture components, we add those components // in MixUp and retrain with the extra weights. 
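To restate the fallback scheme that the gmm-init-biphone changes above implement (biphone and monophone counts loaded by ReadPhoneCounts, thresholds set by --min-biphone-count and --min-monophone-count): a biphone seen often enough in training gets its own pair of pdfs; otherwise it falls back to the right phone's shared monophone pdfs; and if even the monophone is rare, it shares a single global "zerophone" pair. A standalone sketch of just that decision, with hypothetical names and example counts (not the actual tree-building code):

\code{.cpp}
#include <cstdio>

enum class PdfChoice { kBiphone, kMonophone, kZerophone };

// bi_count: occurrences of the (left, right) biphone; mono_count: occurrences
// of the right phone on its own. Defaults match the options registered above.
// When no counts file is given, the real code always takes the biphone branch.
PdfChoice ChoosePdfs(long bi_count, long mono_count,
                     long min_biphone_count = 100, long min_mono_count = 20) {
  if (bi_count >= min_biphone_count) return PdfChoice::kBiphone;
  if (mono_count > min_mono_count) return PdfChoice::kMonophone;
  return PdfChoice::kZerophone;
}

int main() {
  std::printf("%d %d %d\n",
              static_cast<int>(ChoosePdfs(250, 4000)),  // 0: dedicated biphone pdfs
              static_cast<int>(ChoosePdfs(12, 4000)),   // 1: shared monophone pdfs
              static_cast<int>(ChoosePdfs(3, 5)));      // 2: global zerophone pdfs
  return 0;
}
\endcode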
@@ -64,8 +63,7 @@ void LogisticRegression::Train(const Matrix &xs, MixUp(ys, num_classes, conf); Matrix xw(xs_num_rows, weights_.NumRows()); TrainParameters(xs_with_prior, ys, conf, &xw); - KALDI_LOG << - "Finished training mixture components." << std::endl; + KALDI_LOG << "Finished training mixture components."; } } @@ -87,8 +85,7 @@ void LogisticRegression::MixUp(const std::vector &ys, static_cast(0)); KALDI_LOG << "Target number mixture components was " << conf.mix_up - << ". Training " << new_dim << " mixture components. " - << std::endl; + << ". Training " << new_dim << " mixture components."; int32 old_dim = weights_.NumRows(), num_components = old_dim, diff --git a/src/ivectorbin/ivector-plda-scoring-dense.cc b/src/ivectorbin/ivector-plda-scoring-dense.cc index 73ca879e6bc..e96f7de99d4 100644 --- a/src/ivectorbin/ivector-plda-scoring-dense.cc +++ b/src/ivectorbin/ivector-plda-scoring-dense.cc @@ -27,7 +27,14 @@ namespace kaldi { bool EstPca(const Matrix &ivector_mat, BaseFloat target_energy, - Matrix *mat) { + const std::string &reco, Matrix *mat) { + + // If the target_energy is 1.0, it's equivalent to not applying the + // conversation-dependent PCA at all, so it's better to exit this + // function before doing any computation. + if (ApproxEqual(target_energy, 1.0, 0.001)) + return false; + int32 num_rows = ivector_mat.NumRows(), num_cols = ivector_mat.NumCols(); Vector sum; @@ -50,6 +57,8 @@ bool EstPca(const Matrix &ivector_mat, BaseFloat target_energy, else Matrix(sumsq).Svd(&s, &P, NULL); } catch (...) { + KALDI_WARN << "Unable to compute conversation dependent PCA for" + << " recording " << reco << "."; return false; } @@ -181,7 +190,7 @@ int main(int argc, char *argv[]) { for (size_t i = 0; i < ivectors.size(); i++) { ivector_mat.Row(i).CopyFromVec(ivectors[i]); } - if (EstPca(ivector_mat, target_energy, &pca_transform)) { + if (EstPca(ivector_mat, target_energy, reco, &pca_transform)) { // Apply the PCA transform to the raw i-vectors. ApplyPca(ivector_mat, pca_transform, &ivector_mat_pca); @@ -192,8 +201,7 @@ int main(int argc, char *argv[]) { TransformIvectors(ivector_mat_pca, plda_config, this_plda, &ivector_mat_plda); } else { - KALDI_WARN << "Unable to compute conversation dependent PCA for" - << " recording " << reco << "."; + // If EstPca returns false, we won't apply any PCA. 
TransformIvectors(ivector_mat, plda_config, this_plda, &ivector_mat_plda); } diff --git a/src/kwsbin/compute-atwv.cc b/src/kwsbin/compute-atwv.cc index 6d9f6d2c2bb..0907baf268a 100644 --- a/src/kwsbin/compute-atwv.cc +++ b/src/kwsbin/compute-atwv.cc @@ -112,7 +112,7 @@ int main(int argc, char *argv[]) { if (vals.size() != 4) { KALDI_ERR << "Incorrect format of the reference file" << " -- 4 entries expected, " << vals.size() << " given!\n" - << "Key: " << kwid << std::endl; + << "Key: " << kwid; } KwsTerm inst(kwid, vals); aligner.AddRef(inst); @@ -127,7 +127,7 @@ int main(int argc, char *argv[]) { if (vals.size() != 4) { KALDI_ERR << "Incorrect format of the hypotheses file" << " -- 4 entries expected, " << vals.size() << " given!\n" - << "Key: " << kwid << std::endl; + << "Key: " << kwid; } KwsTerm inst(kwid, vals); aligner.AddHyp(inst); @@ -171,4 +171,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc index 447c951d02c..22eae8199ff 100644 --- a/src/lat/determinize-lattice-pruned.cc +++ b/src/lat/determinize-lattice-pruned.cc @@ -665,8 +665,7 @@ template class LatticeDeterminizerPruned { continue; if (opts_.max_loop > 0 && counter++ > opts_.max_loop) { KALDI_ERR << "Lattice determinization aborted since looped more than " - << opts_.max_loop << " times during epsilon closure.\n"; - throw std::runtime_error("looped more than max-arcs times in lattice determinization"); + << opts_.max_loop << " times during epsilon closure."; } for (ArcIterator > aiter(*ifst_, elem.state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); diff --git a/src/lat/sausages.cc b/src/lat/sausages.cc index 16a61b3f5eb..b851bc3604c 100644 --- a/src/lat/sausages.cc +++ b/src/lat/sausages.cc @@ -2,6 +2,7 @@ // Copyright 2012 Johns Hopkins University (Author: Daniel Povey) // 2015 Guoguo Chen +// 2019 Dogan Can // See ../../COPYING for clarification regarding multiple authors // @@ -52,10 +53,45 @@ void MinimumBayesRisk::MbrDecode() { } // build the outputs (time, confidences), if (R_[q] != 0 || opts_.print_silence) { - one_best_times_.push_back(times_[q]); + // see which 'item' from the sausage-bin should we select, + // (not necessarily the 1st one when MBR decoding disabled) + int32 s = 0; + for (int32 j=0; j 1 && one_best_times_[i-2].second > one_best_times_[i-1].first) { + // It's quite possible for this to happen, but it seems like it would + // have a bad effect on the downstream processing, so we fix it here. + // We resolve overlaps by redistributing the available time interval. + BaseFloat prev_right = i > 2 ? one_best_times_[i-3].second : 0.0; + BaseFloat left = std::max(prev_right, + std::min(one_best_times_[i-2].first, + one_best_times_[i-1].first)); + BaseFloat right = std::max(one_best_times_[i-2].second, + one_best_times_[i-1].second); + BaseFloat first_dur = + one_best_times_[i-2].second - one_best_times_[i-2].first; + BaseFloat second_dur = + one_best_times_[i-1].second - one_best_times_[i-1].first; + BaseFloat mid = first_dur > 0 ? 
left + (right - left) * first_dur / + (first_dur + second_dur) : left; + one_best_times_[i-2].first = left; + one_best_times_[i-2].second = one_best_times_[i-1].first = mid; + one_best_times_[i-1].second = right; + } BaseFloat confidence = 0.0; - for (int32 j = 0; j < gamma_[q].size(); j++) - if (gamma_[q][j].first == R_[q]) confidence = gamma_[q][j].second; + for (int32 j = 0; j < gamma_[q].size(); j++) { + if (gamma_[q][j].first == R_[q]) { + confidence = gamma_[q][j].second; + break; + } + } one_best_confidences_.push_back(confidence); } } @@ -146,11 +182,11 @@ void MinimumBayesRisk::AccStats() { std::vector > gamma(Q+1); // temp. form of gamma. // index 1...Q [word] -> occ. - // The tau arrays below are the sums over words of the tau_b - // and tau_e timing quantities mentioned in Appendix C of - // the paper... we are using these to get averaged times for - // the sausage bins, not specifically for the 1-best output. - Vector tau_b(Q+1), tau_e(Q+1); + // The tau maps below are the sums over arcs with the same word label + // of the tau_b and tau_e timing quantities mentioned in Appendix C of + // the paper... we are using these to get averaged times for both the + // the sausage bins and the 1-best output. + std::vector > tau_b(Q+1), tau_e(Q+1); double Ltmp = EditDistance(N, Q, alpha, alpha_dash, alpha_dash_arc); if (L_ != 0 && Ltmp > L_) { // L_ != 0 is to rule out 1st iter. @@ -190,8 +226,8 @@ void MinimumBayesRisk::AccStats() { // next: gamma(q, w(a)) += beta_dash_arc(q) AddToMap(w_a, beta_dash_arc(q), &(gamma[q])); // next: accumulating times, see decl for tau_b,tau_e - tau_b(q) += state_times_[s_a] * beta_dash_arc(q); - tau_e(q) += state_times_[n] * beta_dash_arc(q); + AddToMap(w_a, state_times_[s_a] * beta_dash_arc(q), &(tau_b[q])); + AddToMap(w_a, state_times_[n] * beta_dash_arc(q), &(tau_e[q])); break; case 2: beta_dash(s_a, q) += beta_dash_arc(q); @@ -204,8 +240,8 @@ void MinimumBayesRisk::AccStats() { // WARNING: there was an error in Appendix C. If we followed // the instructions there the next line would say state_times_[sa], but // it would be wrong. I will try to publish an erratum. - tau_b(q) += state_times_[n] * beta_dash_arc(q); - tau_e(q) += state_times_[n] * beta_dash_arc(q); + AddToMap(0, state_times_[n] * beta_dash_arc(q), &(tau_b[q])); + AddToMap(0, state_times_[n] * beta_dash_arc(q), &(tau_e[q])); break; default: KALDI_ERR << "Invalid b_arc value"; // error in code. @@ -222,8 +258,8 @@ void MinimumBayesRisk::AccStats() { AddToMap(0, beta_dash_arc(q), &(gamma[q])); // the statements below are actually redundant because // state_times_[1] is zero. - tau_b(q) += state_times_[1] * beta_dash_arc(q); - tau_e(q) += state_times_[1] * beta_dash_arc(q); + AddToMap(0, state_times_[1] * beta_dash_arc(q), &(tau_b[q])); + AddToMap(0, state_times_[1] * beta_dash_arc(q), &(tau_e[q])); } for (int32 q = 1; q <= Q; q++) { // a check (line 35) double sum = 0.0; @@ -240,7 +276,8 @@ void MinimumBayesRisk::AccStats() { for (int32 q = 1; q <= Q; q++) { for (map::iterator iter = gamma[q].begin(); iter != gamma[q].end(); ++iter) - gamma_[q-1].push_back(std::make_pair(iter->first, static_cast(iter->second))); + gamma_[q-1].push_back( + std::make_pair(iter->first, static_cast(iter->second))); // sort gamma_[q-1] from largest to smallest posterior. GammaCompare comp; std::sort(gamma_[q-1].begin(), gamma_[q-1].end(), comp); @@ -250,18 +287,32 @@ void MinimumBayesRisk::AccStats() { // indexing. 
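The overlap handling added to MbrDecode() above resolves two overlapping one-best word intervals by re-splitting their combined span in proportion to the words' original durations. A standalone check of that arithmetic with made-up times:

\code{.cpp}
#include <algorithm>
#include <cstdio>

int main() {
  // Two consecutive one-best words whose averaged times overlap,
  // e.g. (1.0, 2.5) followed by (2.0, 3.0); no earlier word, so prev_right = 0.
  float prev_right = 0.0f;
  float a_beg = 1.0f, a_end = 2.5f, b_beg = 2.0f, b_end = 3.0f;

  float left  = std::max(prev_right, std::min(a_beg, b_beg));   // 1.0
  float right = std::max(a_end, b_end);                         // 3.0
  float first_dur = a_end - a_beg, second_dur = b_end - b_beg;  // 1.5, 1.0
  float mid = first_dur > 0 ? left + (right - left) * first_dur /
                                     (first_dur + second_dur)
                            : left;                             // 2.2
  std::printf("word 1: (%.1f, %.1f)  word 2: (%.1f, %.1f)\n",
              left, mid, mid, right);  // (1.0, 2.2) and (2.2, 3.0)
  return 0;
}
\endcode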
times_.clear(); times_.resize(Q); + sausage_times_.clear(); + sausage_times_.resize(Q); for (int32 q = 1; q <= Q; q++) { - times_[q-1].first = tau_b(q); - times_[q-1].second = tau_e(q); - if (times_[q-1].first > times_[q-1].second) // this is quite bad. - KALDI_WARN << "Times out of order"; - if (q > 1 && times_[q-2].second > times_[q-1].first) { + double t_b = 0.0, t_e = 0.0; + for (std::vector>::iterator iter = gamma_[q-1].begin(); + iter != gamma_[q-1].end(); ++iter) { + double w_b = tau_b[q][iter->first], w_e = tau_e[q][iter->first]; + if (w_b > w_e) + KALDI_WARN << "Times out of order"; // this is quite bad. + times_[q-1].push_back( + std::make_pair(static_cast(w_b / iter->second), + static_cast(w_e / iter->second))); + t_b += w_b; + t_e += w_e; + } + sausage_times_[q-1].first = t_b; + sausage_times_[q-1].second = t_e; + if (sausage_times_[q-1].first > sausage_times_[q-1].second) + KALDI_WARN << "Times out of order"; // this is quite bad. + if (q > 1 && sausage_times_[q-2].second > sausage_times_[q-1].first) { // We previously had a warning here, but now we'll just set both // those values to their average. It's quite possible for this // condition to happen, but it seems like it would have a bad effect // on the downstream processing, so we fix it. - double avg = 0.5 * (times_[q-2].second + times_[q-1].first); - times_[q-2].second = times_[q-1].first = avg; + sausage_times_[q-2].second = sausage_times_[q-1].first = + 0.5 * (sausage_times_[q-2].second + sausage_times_[q-1].first); } } } @@ -371,7 +422,7 @@ MinimumBayesRisk::MinimumBayesRisk(const CompactLattice &clat_in, PrepareLatticeAndInitStats(&clat); R_ = words; - times_ = times; + sausage_times_ = times; L_ = 0.0; MbrDecode(); diff --git a/src/lat/sausages.h b/src/lat/sausages.h index f613097b190..13f359c60d9 100644 --- a/src/lat/sausages.h +++ b/src/lat/sausages.h @@ -2,6 +2,7 @@ // Copyright 2012 Johns Hopkins University (Author: Daniel Povey) // 2015 Guoguo Chen +// 2019 Dogan Can // See ../../COPYING for clarification regarding multiple authors // @@ -104,17 +105,27 @@ class MinimumBayesRisk { return R_; } + const std::vector > > GetTimes() const { + return times_; // returns average (start,end) times for each word in each + // bin. These are raw averages without any processing, i.e. time intervals + // from different bins can overlap. + } + const std::vector > GetSausageTimes() const { - return times_; // returns average (start,end) times for each bin (each entry - // of GetSausageStats()). Note: if you want the times for the one best, - // you can work out the one best yourself from the sausage stats and get the times - // at the same time. + return sausage_times_; // returns average (start,end) times for each bin. + // This is typically the weighted average of the times in GetTimes() but can + // be slightly different if the times for the bins overlap, in which case + // the times returned by this method do not overlap unlike the times + // returned by GetTimes(). } const std::vector > &GetOneBestTimes() const { - return one_best_times_; // returns average (start,end) times for each bin corresponding - // to an entry in the one-best output. This is just the appropriate - // subsequence of the times in SausageTimes(). + return one_best_times_; // returns average (start,end) times for each word + // corresponding to an entry in the one-best output. 
This is typically the + // appropriate subset of the times in GetTimes() but can be slightly + // different if the times for the one-best words overlap, in which case + // the times returned by this method do not overlap unlike the times + // returned by GetTimes(). } /// Outputs the confidences for the one-best transcript. @@ -122,8 +133,7 @@ class MinimumBayesRisk { return one_best_confidences_; } - /// Returns the expected WER over this sentence (assuming - /// model correctness. + /// Returns the expected WER over this sentence (assuming model correctness). BaseFloat GetBayesRisk() const { return L_; } const std::vector > > &GetSausageStats() const { @@ -222,15 +232,20 @@ class MinimumBayesRisk { // paper. We sort in reverse order on the second member (posterior), so more // likely word is first. - std::vector > times_; + std::vector > > times_; + // The average start and end times for words in each confusion-network bin. + // This is like an average over arcs, of the tau_b and tau_e quantities in + // Appendix C of the paper. Indexed from zero, like gamma_ and R_. + + std::vector > sausage_times_; // The average start and end times for each confusion-network bin. This // is like an average over words, of the tau_b and tau_e quantities in // Appendix C of the paper. Indexed from zero, like gamma_ and R_. std::vector > one_best_times_; - // one_best_times_ is a subsequence of times_, corresponding to - // (start,end) times of words in the one best output. Actually these - // times are averages over the bin that each word came from. + // The average start and end times for words in the one best output. This + // is like an average over the arcs, of the tau_b and tau_e quantities in + // Appendix C of the paper. Indexed from zero, like gamma_ and R_. 
std::vector one_best_confidences_; // vector of confidences for the 1-best output (which could be diff --git a/src/latbin/lattice-1best.cc b/src/latbin/lattice-1best.cc index e03736561f8..e5f3c578fbb 100644 --- a/src/latbin/lattice-1best.cc +++ b/src/latbin/lattice-1best.cc @@ -91,11 +91,11 @@ int main(int argc, char *argv[]) { << "(no output)"; n_err++; } else { - fst::ScaleLattice(fst::LatticeScale(1.0 / lm_scale, 1.0/acoustic_scale), - &best_path); if (word_ins_penalty > 0.0) { - AddWordInsPenToCompactLattice(word_ins_penalty, &clat); + AddWordInsPenToCompactLattice(-word_ins_penalty, &best_path); } + fst::ScaleLattice(fst::LatticeScale(1.0 / lm_scale, 1.0/acoustic_scale), + &best_path); compact_1best_writer.Write(key, best_path); n_done++; } diff --git a/src/latbin/lattice-expand-ngram.cc b/src/latbin/lattice-expand-ngram.cc index 1b8cfbee24b..1e7625d79e0 100644 --- a/src/latbin/lattice-expand-ngram.cc +++ b/src/latbin/lattice-expand-ngram.cc @@ -36,15 +36,15 @@ int main(int argc, char *argv[]) { "Usage: lattice-expand-ngram [options] lattice-rspecifier " "lattice-wspecifier\n" "e.g.: lattice-expand-ngram --n=3 ark:lat ark:expanded_lat\n"; - + ParseOptions po(usage); int32 n = 3; std::string word_syms_filename; po.Register("n", &n, "n-gram context to expand to."); - + po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); @@ -58,10 +58,10 @@ int main(int argc, char *argv[]) { fst::UnweightedNgramFst expand_fst(n); SequentialCompactLatticeReader lat_reader(lats_rspecifier); - CompactLatticeWriter lat_writer(lats_wspecifier); + CompactLatticeWriter lat_writer(lats_wspecifier); int32 n_done = 0, n_fail = 0; - + for (; !lat_reader.Done(); lat_reader.Next()) { std::string key = lat_reader.Key(); KALDI_LOG << "Processing lattice for key " << key; @@ -69,14 +69,14 @@ int main(int argc, char *argv[]) { CompactLattice expanded_lat; ComposeDeterministicOnDemand(lat, &expand_fst, &expanded_lat); if (expanded_lat.Start() == fst::kNoStateId) { - KALDI_WARN << "Empty lattice for utterance " << key << std::endl; + KALDI_WARN << "Empty lattice for utterance " << key; n_fail++; } else { if (lat.NumStates() == expanded_lat.NumStates()) { - KALDI_LOG << "Lattice for key " << key + KALDI_LOG << "Lattice for key " << key << " did not need to be expanded for order " << n << "."; } else { - KALDI_LOG << "Lattice expanded from " << lat.NumStates() << " to " + KALDI_LOG << "Lattice expanded from " << lat.NumStates() << " to " << expanded_lat.NumStates() << " states for order " << n << "."; } lat_writer.Write(key, expanded_lat); @@ -84,7 +84,7 @@ int main(int argc, char *argv[]) { } lat_reader.FreeCurrent(); } - KALDI_LOG << "Processed " << n_done << " lattices with " << n_fail + KALDI_LOG << "Processed " << n_done << " lattices with " << n_fail << " failures."; return 0; } catch(const std::exception &e) { diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc index f3565eabf4e..53e4a1b61bf 100644 --- a/src/lm/arpa-file-parser.cc +++ b/src/lm/arpa-file-parser.cc @@ -74,7 +74,7 @@ void ArpaFileParser::Read(std::istream &is) { warning_count_ = 0; current_line_.clear(); -#define PARSE_ERR (KALDI_ERR << LineReference() << ": ") +#define PARSE_ERR KALDI_ERR << LineReference() << ": " // Give derived class an opportunity to prepare its state. 
ReadStarted(); diff --git a/src/lm/arpa-lm-compiler-test.cc b/src/lm/arpa-lm-compiler-test.cc index 697d70c416a..ccfd26af7e5 100644 --- a/src/lm/arpa-lm-compiler-test.cc +++ b/src/lm/arpa-lm-compiler-test.cc @@ -209,8 +209,7 @@ bool ThrowsExceptionTest(bool seps, const string &infile) { // Make memory cleanup easy in both cases of try-catch block. std::unique_ptr compiler(Compile(seps, infile)); return false; - } catch (const std::runtime_error&) { - // Kaldi throws only std::runtime_error in kaldi-error.cc + } catch (const KaldiFatalError&) { return true; } } diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index dbe0b6d9416..eb8cf743ab3 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -5,13 +5,13 @@ ifndef CUDATKDIR $(error CUDATKDIR not defined.) endif -CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -fPIC -pthread -isystem $(OPENFSTINC) -rdynamic --verbose +CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -fPIC -pthread -isystem $(OPENFSTINC) CUDA_INCLUDE= -I$(CUDATKDIR)/include -I$(CUBROOT) CUDA_FLAGS = --machine 64 -DHAVE_CUDA \ -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -std=c++11 -DCUDA_API_PER_THREAD_DEFAULT_STREAM -lineinfo \ - -Xcompiler "$(CXXFLAGS)" + --verbose -Xcompiler "$(CXXFLAGS)" CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lnvToolsExt #LDLIBS : The libs are loaded later than static libs in implicit rule +CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand -lnvToolsExt #LDLIBS : The .so libs are loaded later than static libs in implicit rule diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index 25dafae2f3a..fcce90f5c21 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -125,7 +125,7 @@ valgrind: .valgrind #buid up dependency commands CC_SRCS=$(wildcard *.cc) #check if files exist to run dependency commands on -ifneq ($(CC_SRCS),) +ifneq ($(CC_SRCS),) CC_DEP_COMMAND=$(CXX) -M $(CXXFLAGS) $(CC_SRCS) endif diff --git a/src/makefiles/linux_x86_64_mkl.mk b/src/makefiles/linux_x86_64_mkl.mk index 7a70fa51a65..d1c399d9796 100644 --- a/src/makefiles/linux_x86_64_mkl.mk +++ b/src/makefiles/linux_x86_64_mkl.mk @@ -22,7 +22,7 @@ ifndef MKLROOT $(error MKLROOT not defined.) endif -MKLLIB ?= $(MKLROOT)/lib/em64t +MKLLIB ?= $(MKLROOT)/lib/intel64 CXXFLAGS = -std=c++11 -I.. -isystem $(OPENFSTINC) -O1 $(EXTRA_CXXFLAGS) \ -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ diff --git a/src/matrix/tp-matrix.cc b/src/matrix/tp-matrix.cc index f01ee1e8f46..6e34dc643e9 100644 --- a/src/matrix/tp-matrix.cc +++ b/src/matrix/tp-matrix.cc @@ -51,7 +51,7 @@ void TpMatrix::Invert() { // format, so we temporarily put in non-packed format. Matrix tmp(*this); int rows = static_cast(this->num_rows_); - + // ATLAS call. It's really row-major ordering and a lower triangular matrix, // but there is some weirdness with Fortran-style indexing that we need to // take account of, so everything gets swapped. @@ -102,14 +102,13 @@ void TpMatrix::Cholesky(const SpMatrix &orig) { } // d = orig(j, j) - d; d = orig_jdata[j] - d; - + if (d >= 0.0) { // (*this)(j, j) = std::sqrt(d); jdata[j] = std::sqrt(d); } else { - KALDI_WARN << "Cholesky decomposition failed. Maybe matrix " - "is not positive definite. Throwing error"; - throw std::runtime_error("Cholesky decomposition failed."); + KALDI_ERR << "Cholesky decomposition failed. 
Maybe matrix " + "is not positive definite."; } } } @@ -144,5 +143,3 @@ template class TpMatrix; template class TpMatrix; } // namespace kaldi - - diff --git a/src/matrix/tp-matrix.h b/src/matrix/tp-matrix.h index b215e73b000..e3b08701543 100644 --- a/src/matrix/tp-matrix.h +++ b/src/matrix/tp-matrix.h @@ -45,11 +45,11 @@ class TpMatrix : public PackedMatrix { /// Copy constructor from CUDA TpMatrix /// This is defined in ../cudamatrix/cu-tp-matrix.cc explicit TpMatrix(const CuTpMatrix &cu); - - + + template explicit TpMatrix(const TpMatrix& orig) : PackedMatrix(orig) {} - + Real operator() (MatrixIndexT r, MatrixIndexT c) const { if (static_cast(c) > static_cast(r)) { @@ -74,9 +74,9 @@ class TpMatrix : public PackedMatrix { return *(this->data_ + (r*(r+1)) / 2 + c); // Duplicating code from PackedMatrix.h } - // Note: Cholesky may throw std::runtime_error + // Note: Cholesky may throw KaldiFatalError. void Cholesky(const SpMatrix& orig); - + void Invert(); // Inverts in double precision. @@ -99,7 +99,7 @@ class TpMatrix : public PackedMatrix { /// This is implemented in ../cudamatrix/cu-tp-matrix.cc void CopyFromMat(const CuTpMatrix &other); - + /// CopyFromTp copies another triangular matrix into this one. void CopyFromTp(const TpMatrix &other) { PackedMatrix::CopyFromPacked(other); @@ -132,4 +132,3 @@ class TpMatrix : public PackedMatrix { #endif - diff --git a/src/nnet2/combine-nnet-fast.cc b/src/nnet2/combine-nnet-fast.cc index 02265a5f6ac..7ab2c9caf05 100644 --- a/src/nnet2/combine-nnet-fast.cc +++ b/src/nnet2/combine-nnet-fast.cc @@ -204,7 +204,7 @@ void FastNnetCombiner::CombineNnets(const Vector &scale_params, int32 num_nnets = nnets.size(); KALDI_ASSERT(num_nnets >= 1); int32 num_uc = nnets[0].NumUpdatableComponents(); - KALDI_ASSERT(num_nnets * nnets[0].NumUpdatableComponents()); + KALDI_ASSERT(nnets[0].NumUpdatableComponents() >= 1); *dest = nnets[0]; diff --git a/src/nnet2/combine-nnet.cc b/src/nnet2/combine-nnet.cc index 417db1b84c4..57cc6133c58 100644 --- a/src/nnet2/combine-nnet.cc +++ b/src/nnet2/combine-nnet.cc @@ -31,9 +31,9 @@ static void CombineNnets(const Vector &scale_params, int32 num_nnets = nnets.size(); KALDI_ASSERT(num_nnets >= 1); int32 num_uc = nnets[0].NumUpdatableComponents(); - KALDI_ASSERT(num_nnets * nnets[0].NumUpdatableComponents()); - - + KALDI_ASSERT(nnets[0].NumUpdatableComponents() >= 1); + + *dest = nnets[0]; SubVector scale_params0(scale_params, 0, num_uc); dest->ScaleComponents(scale_params0); @@ -59,7 +59,7 @@ static int32 GetInitialModel( for (int32 n = 0; n < num_nnets; n++) { BaseFloat objf = ComputeNnetObjf(nnets[n], validation_set, minibatch_size) / tot_frames; - + if (n == 0 || objf > best_objf) { best_objf = objf; best_n = n; @@ -98,7 +98,7 @@ static void GetInitialScaleParams( num_nnets = static_cast(nnets.size()); if (initial_model < 0 || initial_model > num_nnets) initial_model = GetInitialModel(validation_set, nnets); - + KALDI_ASSERT(initial_model >= 0 && initial_model <= num_nnets); int32 num_uc = nnets[0].NumUpdatableComponents(); @@ -107,7 +107,7 @@ static void GetInitialScaleParams( KALDI_LOG << "Initializing with neural net with index " << initial_model; // At this point we're using the best of the individual neural nets. 
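The KALDI_ASSERT change above is worth a second look: the old expression asserted only that a product was nonzero, so it could not express the intended "at least one updatable component" condition. A standalone sketch with made-up values:

#include <iostream>

int main() {
  int num_nnets = 3, num_updatable = -1;  // deliberately bad component count
  bool old_check = (num_nnets * num_updatable) != 0;  // true: product is nonzero
  bool new_check = num_updatable >= 1;                // false: catches the bug
  std::cout << std::boolalpha << old_check << ' ' << new_check << '\n';
  return 0;
}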
scale_params->Set(0.0); - + // Set the block of parameters corresponding to the "best" of the // source neural nets to SubVector best_block(*scale_params, num_uc * initial_model, num_uc); @@ -129,14 +129,14 @@ static double ComputeObjfAndGradient( Vector *gradient) { Vector scale_params_float(scale_params); - + Nnet nnet_combined; CombineNnets(scale_params_float, nnets, &nnet_combined); - + Nnet nnet_gradient(nnet_combined); bool is_gradient = true; nnet_gradient.SetZero(is_gradient); - + // note: "ans" is normalized by the total weight of validation frames. int32 batch_size = 1024; double ans = ComputeNnetGradient(nnet_combined, @@ -146,7 +146,7 @@ static double ComputeObjfAndGradient( double tot_frames = validation_set.size(); if (gradient != NULL) { - int32 i = 0; // index into scale_params. + int32 i = 0; // index into scale_params. for (int32 n = 0; n < static_cast(nnets.size()); n++) { for (int32 j = 0; j < nnet_combined.NumComponents(); j++) { const UpdatableComponent *uc = @@ -155,7 +155,7 @@ static double ComputeObjfAndGradient( dynamic_cast(&(nnet_gradient.GetComponent(j))); if (uc != NULL) { double dotprod = uc->DotProduct(*uc_gradient) / tot_frames; - (*gradient)(i) = dotprod; + (*gradient)(i) = dotprod; i++; } } @@ -165,14 +165,14 @@ static double ComputeObjfAndGradient( if (debug) { KALDI_LOG << "Double-checking gradient computation"; - + Vector manual_gradient(scale_params.Dim()); for (int32 i = 0; i < scale_params.Dim(); i++) { double delta = 1.0e-04, fg = fabs((*gradient)(i)); if (fg < 1.0e-07) fg = 1.0e-07; if (fg * delta < 1.0e-05) delta = 1.0e-05 / fg; - + Vector scale_params_temp(scale_params); scale_params_temp(i) += delta; double new_ans = ComputeObjfAndGradient(validation_set, @@ -185,10 +185,10 @@ static double ComputeObjfAndGradient( KALDI_LOG << "Manually computed gradient is " << manual_gradient; KALDI_LOG << "Gradient we computed is " << *gradient; } - + return ans; } - + void CombineNnets(const NnetCombineConfig &combine_config, const std::vector &validation_set, @@ -205,7 +205,7 @@ void CombineNnets(const NnetCombineConfig &combine_config, int32 dim = scale_params.Dim(); KALDI_ASSERT(dim > 0); Vector gradient(dim); - + double objf, initial_objf; LbfgsOptions lbfgs_options; @@ -213,11 +213,11 @@ void CombineNnets(const NnetCombineConfig &combine_config, lbfgs_options.m = dim; // Store the same number of vectors as the dimension // itself, so this is BFGS. 
lbfgs_options.first_step_impr = combine_config.initial_impr; - + OptimizeLbfgs lbfgs(scale_params, lbfgs_options); - - for (int32 i = 0; i < combine_config.num_bfgs_iters; i++) { + + for (int32 i = 0; i < combine_config.num_bfgs_iters; i++) { scale_params.CopyFromVec(lbfgs.GetProposedValue()); objf = ComputeObjfAndGradient(validation_set, scale_params, @@ -227,9 +227,9 @@ void CombineNnets(const NnetCombineConfig &combine_config, KALDI_VLOG(2) << "Iteration " << i << " scale-params = " << scale_params << ", objf = " << objf << ", gradient = " << gradient; - + if (i == 0) initial_objf = objf; - + lbfgs.DoStep(objf, gradient); } @@ -244,10 +244,10 @@ void CombineNnets(const NnetCombineConfig &combine_config, nnets[0].NumUpdatableComponents()); scale_params_mat.CopyRowsFromVec(scale_params_float); KALDI_LOG << "Final scale factors are " << scale_params_mat; - + CombineNnets(scale_params_float, nnets, nnet_out); } - - + + } // namespace nnet2 } // namespace kaldi diff --git a/src/nnet2bin/nnet-am-compute.cc b/src/nnet2bin/nnet-am-compute.cc index 32da30b73a5..fe07f9c6a25 100644 --- a/src/nnet2bin/nnet-am-compute.cc +++ b/src/nnet2bin/nnet-am-compute.cc @@ -94,7 +94,7 @@ int main(int argc, char *argv[]) { int64 num_done = 0, num_frames = 0; Vector inv_priors(am_nnet.Priors()); - KALDI_ASSERT(inv_priors.Dim() == am_nnet.NumPdfs() && + KALDI_ASSERT(!divide_by_priors || inv_priors.Dim() == am_nnet.NumPdfs() && "Priors in neural network not set up."); inv_priors.ApplyPow(-1.0); diff --git a/src/nnet3/convolution.cc b/src/nnet3/convolution.cc index 287ab7f47dd..1c5396949f8 100644 --- a/src/nnet3/convolution.cc +++ b/src/nnet3/convolution.cc @@ -976,7 +976,7 @@ static void ComputeTempMatrixSize(const ConvolutionComputationOptions &opts, // work out how many rows the temporary matrix should have, taking // into account the specified memory limit. temp_rows = computation->num_t_out * computation->num_images; - BaseFloat num_megabytes = (4 * temp_rows * temp_cols) / 1000000.0, + BaseFloat num_megabytes = (4 * (temp_rows / 1000.0) * (temp_cols / 1000.0)), megabyte_limit = opts.max_memory_mb; // C++ rounds down; here, we want to round up so we add one. int32 ratio = 1.0 + num_megabytes / megabyte_limit; @@ -986,7 +986,7 @@ static void ComputeTempMatrixSize(const ConvolutionComputationOptions &opts, // >= temp_rows so that we don't have a small leftover piece. int32 new_num_t_out = (computation->num_t_out + ratio - 1) / ratio; temp_rows = new_num_t_out * computation->num_images; - BaseFloat new_num_megabytes = (4 * temp_rows * temp_cols) / 1000000.0; + BaseFloat new_num_megabytes = (4 * (temp_rows / 1000.0) * (temp_cols / 1000.0)); // make sure we're within the memory limit. 
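On the reworked megabyte formula above: dividing each factor by 1000.0 keeps the product in floating point, whereas the old form computed 4 * temp_rows * temp_cols in 32-bit integer arithmetic before the division, which overflows for large temporary matrices (presumably the motivation for this change). A standalone sketch:

#include <cstdint>
#include <iostream>

int main() {
  int32_t temp_rows = 600000, temp_cols = 1000;
  // The product the old expression had to form before dividing by 1e6:
  int64_t product = 4LL * temp_rows * temp_cols;        // 2.4e9, too big for int32
  std::cout << "fits in int32: " << (product <= INT32_MAX) << '\n';  // prints 0
  // The new formulation never forms that integer product:
  double megabytes = 4 * (temp_rows / 1000.0) * (temp_cols / 1000.0);
  std::cout << megabytes << " MB\n";                    // prints 2400
  return 0;
}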
if (new_num_megabytes > 1.01 * megabyte_limit) { KALDI_WARN << "Memory consumed in convolution is more than requested " diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index 0452304cf55..71aa7daaa17 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -52,7 +52,6 @@ DecodableNnetSimpleLoopedInfo::DecodableNnetSimpleLoopedInfo( Init(opts, &(am_nnet->GetNnet())); } - void DecodableNnetSimpleLoopedInfo::Init( const NnetSimpleLoopedComputationOptions &opts, Nnet *nnet) { @@ -86,10 +85,8 @@ void DecodableNnetSimpleLoopedInfo::Init( CompileLooped(*nnet, opts.optimize_config, request1, request2, request3, &computation); computation.ComputeCudaIndexes(); - if (GetVerboseLevel() >= 3) { - KALDI_VLOG(3) << "Computation is:"; - computation.Print(std::cerr, *nnet); - } + KALDI_VLOG(3) << "Computation is:\n" + << NnetComputationPrintInserter{computation, *nnet}; } diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 0677e1ca474..a205490ee3f 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -119,14 +119,14 @@ void OnlineNaturalGradient::InitDefault(int32 D) { t_ = 0; } -void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { - int32 D = R0.NumCols(); +void OnlineNaturalGradient::Init(const CuMatrixBase &X0) { + int32 D = X0.NumCols(); // for locking reasons it's better to use a different object. OnlineNaturalGradient this_copy(*this); this_copy.InitDefault(D); this_copy.t_ = 1; // Prevent recursion to Init() again. - CuMatrix R0_copy(R0.NumRows(), R0.NumCols(), kUndefined); + CuMatrix X0_copy(X0.NumRows(), X0.NumCols(), kUndefined); // 'num_iters' is number of iterations with the same data from a pseudorandom // start. this is a faster way of starting than doing eigenvalue // decomposition. @@ -134,11 +134,11 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { // Note: we only do three iterations of initialization if we have enough data // that it's reasonably possible to estimate the subspace of dimension // this_copy.rank_. If we don't have more than that many rows in our initial - // minibatch R0, we just do one iteration... this gives us almost exactly - // (barring small effects due to epsilon_ > 0) the row subspace of R0 after + // minibatch X0, we just do one iteration... this gives us almost exactly + // (barring small effects due to epsilon_ > 0) the row subspace of X0 after // one iteration anyway. int32 num_init_iters; - if (R0.NumRows() <= this_copy.rank_) + if (X0.NumRows() <= this_copy.rank_) num_init_iters = 1; else num_init_iters = 3; @@ -147,8 +147,8 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { // initialize. 
for (int32 i = 0; i < num_init_iters; i++) { BaseFloat scale; - R0_copy.CopyFromMat(R0); - this_copy.PreconditionDirections(&R0_copy, &scale); + X0_copy.CopyFromMat(X0); + this_copy.PreconditionDirections(&X0_copy, &scale); } rank_ = this_copy.rank_; W_t_.Swap(&this_copy.W_t_); @@ -197,7 +197,7 @@ void OnlineNaturalGradient::PreconditionDirections( t_ += 1; } -void OnlineNaturalGradient::ReorthogonalizeXt1( +void OnlineNaturalGradient::ReorthogonalizeRt1( const VectorBase &d_t1, BaseFloat rho_t1, CuMatrixBase *W_t1, @@ -214,7 +214,7 @@ void OnlineNaturalGradient::ReorthogonalizeXt1( ComputeEt(d_t1, beta_t1, &e_t1, &sqrt_e_t1, &inv_sqrt_e_t1); temp_O->SymAddMat2(1.0, *W_t1, kNoTrans, 0.0); - // O_t = E_t^{-0.5} W_t W_t^T E_t^{-0.5} + // O_{t+1} = E_{t+1}^{-0.5} W_{t+1} W_{t+1}^T E_{t+1}^{-0.5} Matrix O_mat(*temp_O); SpMatrix O(O_mat, kTakeLower); for (int32 i = 0; i < R; i++) { @@ -439,7 +439,7 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( if (self_debug_) { KALDI_WARN << "Reorthogonalizing."; } - ReorthogonalizeXt1(d_t1, + ReorthogonalizeRt1(d_t1, rho_t1, &W_t1, &J_t, @@ -510,7 +510,7 @@ void OnlineNaturalGradient::ComputeWt1(int32 N, // B_t = J_t + (1-\eta)/(\eta/N) (D_t + \rho_t I) W_t J_t->AddDiagVecMat(1.0, w_t_coeff_gpu, W_t, kNoTrans, 1.0); - // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} B_t + // A_t = (\eta/N) E_{t+1}^{0.5} C_t^{-0.5} U_t^T E_t^{-0.5} Matrix A_t(U_t, kTrans); for (int32 i = 0; i < R; i++) { BaseFloat i_factor = (eta / N) * sqrt_e_t1(i) * inv_sqrt_c_t(i); diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index a68ad9bbb53..77be28a19d4 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -375,8 +375,8 @@ namespace nnet3 { * Initialization * Now, a note on what we do on time t = 0, i.e. for the first minibatch. We - initialize X_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the - minibatch size (num-rows of R0). If L is the corresponding RxR diagonal + initialize R_0 to the top R eigenvectors of 1/N X_0 X_0^T, where N is the + minibatch size (num-rows of X0). If L is the corresponding RxR diagonal matrix of eigenvalues, then we will set D_0 = L - \rho_0 I. We set \rho_0 to ensure that tr(F_0) = 1/N tr(X_0 X_0^T), @@ -457,7 +457,7 @@ class OnlineNaturalGradient { not. */ - void PreconditionDirections(CuMatrixBase *R, + void PreconditionDirections(CuMatrixBase *X, BaseFloat *scale); @@ -515,7 +515,7 @@ class OnlineNaturalGradient { // This function is called if C_t has high condition number; it makes sure // that R_{t+1} is orthogonal. See the section in the extended comment above // on "keeping R_t orthogonal". - void ReorthogonalizeXt1(const VectorBase &d_t1, + void ReorthogonalizeRt1(const VectorBase &d_t1, BaseFloat rho_t1, CuMatrixBase *W_t1, CuMatrixBase *temp_W, diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 584a7c19ab8..a3696403eba 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -880,7 +880,7 @@ void ComputationChecker::CheckComputationIndexes() const { KALDI_ERR << "Backprop input needed but not supplied."; if ((properties & kBackpropNeedsOutput) && c.arg4 == 0) KALDI_ERR << "Backprop output needed but not supplied."; - if (c.arg6 == 0 && !(properties && kUpdatableComponent)) { + if (c.arg6 == 0 && !(properties & kUpdatableComponent)) { // note: we could perhaps make this just a warning, // or optimize it away somehow. 
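The '&&' to '&' fix above deserves a note: with the logical operator the test was true whenever both operands were nonzero, so the updatable-component flag bit was never actually inspected. A minimal sketch; the enum values are hypothetical stand-ins for the real nnet3 property flags:

#include <iostream>

enum Properties { kUpdatableComponent = 0x01, kPropagateInPlace = 0x02 };

int main() {
  int properties = kPropagateInPlace;  // the updatable flag is NOT set
  bool logical = properties && kUpdatableComponent;        // true: both nonzero
  bool bitwise = (properties & kUpdatableComponent) != 0;  // false: flag absent
  std::cout << std::boolalpha << logical << ' ' << bitwise << '\n';
  return 0;
}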
KALDI_ERR << "Backprop is done but has no effect."; diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc index 5da55d0f70d..7124afb22b1 100644 --- a/src/nnet3/nnet-batch-compute.cc +++ b/src/nnet3/nnet-batch-compute.cc @@ -863,6 +863,40 @@ void MergeTaskOutput( } KALDI_ASSERT(cur_output_frame == num_output_frames); } +void MergeTaskOutput( + const std::vector &tasks, + CuMatrix *output) { + int32 num_tasks = tasks.size(), + num_output_frames = 0, + output_dim = -1; + for (int32 i = 0; i < num_tasks; i++) { + const NnetInferenceTask &task = tasks[i]; + num_output_frames += task.num_used_output_frames; + if (i == 0) { + output_dim = (task.output_to_cpu ? + task.output_cpu.NumCols() : + task.output.NumCols()); + } + } + KALDI_ASSERT(num_output_frames != 0 && output_dim != 0); + int32 cur_output_frame = 0; + output->Resize(num_output_frames, output_dim); + for (int32 i = 0; i < num_tasks; i++) { + const NnetInferenceTask &task = tasks[i]; + int32 skip = task.num_initial_unused_output_frames, + num_used = task.num_used_output_frames; + KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index); + if (task.output_to_cpu) { + output->RowRange(cur_output_frame, num_used).CopyFromMat( + task.output_cpu.RowRange(skip, num_used)); + } else { + output->RowRange(cur_output_frame, num_used).CopyFromMat( + task.output.RowRange(skip, num_used)); + } + cur_output_frame += num_used; + } + KALDI_ASSERT(cur_output_frame == num_output_frames); +} NnetBatchInference::NnetBatchInference( diff --git a/src/nnet3/nnet-batch-compute.h b/src/nnet3/nnet-batch-compute.h index 9861a28976c..bdc58e8cb4b 100644 --- a/src/nnet3/nnet-batch-compute.h +++ b/src/nnet3/nnet-batch-compute.h @@ -193,6 +193,9 @@ struct NnetBatchComputerOptions: public NnetSimpleComputationOptions { void MergeTaskOutput( const std::vector &tasks, Matrix *output); +void MergeTaskOutput( + const std::vector &tasks, + CuMatrix *output); /** This class does neural net inference in a way that is optimized for GPU use: diff --git a/src/nnet3/nnet-compile-utils-test.cc b/src/nnet3/nnet-compile-utils-test.cc index 53820abf32a..894d0a3577b 100644 --- a/src/nnet3/nnet-compile-utils-test.cc +++ b/src/nnet3/nnet-compile-utils-test.cc @@ -95,7 +95,7 @@ void UnitTestSplitLocationsBackward(bool verbose) { num_locations : max_generated_submat_list_size; submat_lists[i].reserve(num_locations); for (int32 j = 0; j < num_locations; j++) { - if (j <= min_num_kaddrows) + if (j <= min_num_kaddrows && j < num_submat_indexes) // since we need min_num_kaddrows in the split_lists we ensure that // we add a pair with the same first element in all the submat_lists submat_lists[i].push_back(std::make_pair(submat_indexes[j], diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 53859e9b03c..c6757b79c33 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -173,6 +173,10 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new LstmNonlinearityComponent(); } else if (component_type == "BatchNormComponent") { ans = new BatchNormComponent(); + } else if (component_type == "MeanNormComponent") { + ans = new MeanNormComponent(); + } else if (component_type == "VarNormComponent") { + ans = new VarNormComponent(); } else if (component_type == "TimeHeightConvolutionComponent") { ans = new TimeHeightConvolutionComponent(); } else if (component_type == "RestrictedAttentionComponent") { diff --git a/src/nnet3/nnet-component-test.cc 
b/src/nnet3/nnet-component-test.cc index 618fa7c0c45..8bfe63acc3e 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -142,6 +142,11 @@ void TestNnetComponentUpdatable(Component *c) { KALDI_ERR << "Expected info strings to be equal: '" << uc2->Info() << "' vs. '" << uc3->Info() << "'"; } + + // the following avoids a rare failure in the next check (due to roundoff). + delete uc3; + uc3 = dynamic_cast(uc2->Copy()); + // testing that scaling by 0.5 works the same whether // done on the vectorized paramters or via Scale(). Vector vec2(uc->NumParameters()); @@ -308,6 +313,9 @@ void TestSimpleComponentPropagateProperties(const Component &c) { bool TestSimpleComponentDataDerivative(const Component &c, BaseFloat perturb_delta) { + if (c.Type() == "MeanNormComponent") + return true; // This test expected to fail for this type. + MatrixStrideType input_stride_type = (c.Properties()&kInputContiguous) ? kStrideEqualNumCols : kDefaultStride; MatrixStrideType output_stride_type = (c.Properties()&kOutputContiguous) ? diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 97d8b9045ea..a3571eeb532 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -514,17 +514,22 @@ struct NnetComputation { NnetComputation(): need_model_derivative(false) { } }; - - - -// This operator is to print out the NnetComputation in a human-readable way, for -// debugging purposes. -// We don't give Read and Write functions to struct NnetComputation, because we -// don't anticipate needing to write it to disk. -std::ostream &operator << (std::ostream &os, - NnetComputation &computation); - - +// A helper class equipped with the stream insertion operator<< to print out +// the NnetComputation in a human-readable way, with NnetComputation::Print(), +// for debugging purposes, e.g.: +// KALDI_VLOG(3) << NnetComputationPrintInserter{mycomputation, mynet}; +struct NnetComputationPrintInserter { + const NnetComputation& computation; + const Nnet& nnet; + void Print(std::ostream& os) const { + computation.Print(os, nnet); + } + friend inline std::ostream &operator <<(std::ostream &os, + NnetComputationPrintInserter xhis) { + xhis.Print(os); + return os; + } +}; } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index d10c6fabd36..a38a54b62e6 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -676,5 +676,586 @@ void BatchNormComponent::ZeroStats() { } +MeanNormComponent::MeanNormComponent(const MeanNormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), + backprop_normalize_scale_(other.backprop_normalize_scale_), + test_mode_(other.test_mode_), count_(other.count_), + stats_sum_(other.stats_sum_), offset_(other.offset_) { } + + +void MeanNormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; +} + +void MeanNormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + backprop_normalize_scale_ >= 0.0 && + backprop_normalize_scale_ <= 1.0 && + count_ >= 0 && stats_sum_.Dim() == block_dim_ && + offset_.Dim() == block_dim_); +} + +std::string MeanNormComponent::Info() const { + std::ostringstream stream; + Check(); + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", backprop-normalize-scale=" << backprop_normalize_scale_ + << ", count=" << count_ + << ", test-mode=" << (test_mode_ ? 
"true" : "false") + << ", offset=" << SummarizeVector(offset_); + return stream.str(); +} + +void MeanNormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + test_mode_ = false; + backprop_normalize_scale_ = 0.0; + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("test-mode", &test_mode_); + cfl->GetValue("backprop-normalize-scale", &backprop_normalize_scale_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "MeanNormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0)) + KALDI_ERR << "Invalid configuration in MeanNormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + count_ = 0; + stats_sum_.Resize(block_dim_); + offset_.Resize(block_dim_); + Check(); +} + + +void* MeanNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + if (!test_mode_) { + Memo *memo = new Memo; + int32 num_frames = in.NumRows(), dim = block_dim_; + memo->num_frames = num_frames; + memo->sum_offset_temp.Resize(3, dim); + CuSubVector sum(memo->sum_offset_temp, 0), + offset(memo->sum_offset_temp, 1); + sum.AddRowSumMat(1.0, in, 0.0); + offset.CopyFromVec(sum); + offset.AddVec(1.0, stats_sum_); + offset.Scale(-1.0 / (num_frames + count_)); + + // the next command will do no work if out == in, for in-place propagation. + out->CopyFromMat(in); + out->AddVecToRows(1.0, offset, 1.0); + return static_cast(memo); + } else { + out->CopyFromMat(in); + out->AddVecToRows(1.0, offset_, 1.0); + return NULL; + } +} + +void MeanNormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update, // unused + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_deriv, *in_deriv) && + (out_deriv.NumCols() == dim_ || + out_deriv.NumCols() == block_dim_)); + if (out_deriv.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_deriv.NumRows(), + orig_cols = out_deriv.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + // we'll never use in_value or out_value, so pass them in unchanged. 
+ Backprop(debug_info, indexes, in_value, + out_value, out_deriv_reshaped, + memo_in, to_update, &in_deriv_reshaped); + return; + } + + Memo *memo = static_cast(memo_in); + + + // the following statement does no work if in_deriv and out_deriv are the + // same matrix. + in_deriv->CopyFromMat(out_deriv); + + if (test_mode_ || backprop_normalize_scale_ == 0.0) { + return; + } + + // OK, we are not in test mode, and backprop_normalize_scale_ is nonzero. + + // See comment above declaration of class in nnet-normalize-component.h + // for the math. + KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); + CuSubVector deriv_offset(memo->sum_offset_temp, 2); + + // set deriv_offset to + // - backprop_normalize_scale * (\sum_i \hat{y}_i) / n + deriv_offset.AddRowSumMat( + -backprop_normalize_scale_ / out_deriv.NumRows(), + out_deriv); + + // We already copied out_deriv to in_deriv. + in_deriv->AddVecToRows(1.0, deriv_offset); +} + +void MeanNormComponent::StoreStats( + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo_in) { + // in test mode this component does not store stats, it doesn't provide the + // kStoresStats flag. + KALDI_ASSERT(!test_mode_); + KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + // we'll never use in_value, so just pass it in unchanged. + StoreStats(in_value, out_value_reshaped, memo_in); + return; + } + + Memo *memo = static_cast(memo_in); + KALDI_ASSERT(out_value.NumRows() == memo->num_frames); + + CuSubVector sum(memo->sum_offset_temp, 0); + stats_sum_.AddVec(1.0, sum); + count_ += memo->num_frames; + KALDI_ASSERT(count_ > 0.0); + offset_.CopyFromVec(stats_sum_); + offset_.Scale(1.0 / count_); +} + +void MeanNormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &backprop_normalize_scale_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + // We write the mean, to make inspection of the on-disk format easier. 
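A scalar sketch of the round trip implied by the comment above: Write() stores mean = stats_sum / count, and Read() rebuilds stats_sum = mean * count and offset = -mean, so that test-mode propagation (y = x + offset) subtracts the accumulated mean. The numbers below are made up:

#include <cassert>

int main() {
  double stats_sum = 12.0, count = 4.0;       // accumulated statistics
  double stored_mean = stats_sum / count;     // what Write() puts on disk
  double restored_sum = stored_mean * count;  // reconstructed by Read()
  double offset = -stored_mean;               // also reconstructed by Read()
  assert(restored_sum == stats_sum);
  assert(offset == -3.0);
  return 0;
}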
+ ExpectToken(is, binary, ""); + stats_sum_.Read(is, binary); + offset_ = stats_sum_; + stats_sum_.Scale(count_); + offset_.Scale(-1.0); + ExpectToken(is, binary, ""); + Check(); +} + +void MeanNormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, backprop_normalize_scale_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + CuVector mean(stats_sum_); + if (count_ != 0) { + mean.Scale(1.0 / count_); + } + WriteToken(os, binary, ""); + mean.Write(os, binary); + WriteToken(os, binary, ""); +} + +void MeanNormComponent::Scale(BaseFloat scale) { + if (scale == 0) { + count_ = 0.0; + stats_sum_.SetZero(); + offset_.SetZero(); + } else { + count_ *= scale; + stats_sum_.Scale(scale); + } +} + + +void MeanNormComponent::Add(BaseFloat alpha, const Component &other_in) { + const MeanNormComponent *other = + dynamic_cast(&other_in); + count_ += alpha * other->count_; + stats_sum_.AddVec(alpha, other->stats_sum_); + if (count_ != 0.0) { + offset_.CopyFromVec(stats_sum_); + offset_.Scale(1.0 / count_); + } +} + +VarNormComponent::VarNormComponent(const VarNormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), + epsilon_(other.epsilon_), average_r_(other.average_r_), + test_mode_(other.test_mode_), count_(other.count_), + stats_sumsq_(other.stats_sumsq_), scale_(other.scale_), + r_count_(other.r_count_), r_sum_(other.r_sum_) { } + +void VarNormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; +} + +void VarNormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0.0 && + count_ >= 0 && r_count_ >= 0.0 && + stats_sumsq_.Dim() == block_dim_ && + scale_.Dim() == block_dim_ && + r_sum_.Dim() == block_dim_); +} + +std::string VarNormComponent::Info() const { + Check(); + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", epsilon=" << epsilon_ + << ", average-r=" << std::boolalpha << average_r_ + << ", test-mode=" << (test_mode_ ? 
"true" : "false") + << ", count=" << count_ + << ", scale=" << SummarizeVector(scale_) + << ", r-count=" << r_count_; + if (r_count_ != 0) { + CuVector r(r_sum_); + r.Scale(1.0 / r_count_); + stream << ", r=" << SummarizeVector(r); + } + return stream.str(); +} + +void VarNormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + epsilon_ = 0.001; + test_mode_ = false; + average_r_ = false; + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("test-mode", &test_mode_); + cfl->GetValue("average-r", &average_r_); + cfl->GetValue("epsilon", &epsilon_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "VarNormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0)) + KALDI_ERR << "Invalid configuration in VarNormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + count_ = 0; + stats_sumsq_.Resize(block_dim_); + scale_.Resize(block_dim_, kUndefined); + scale_.Set(1.0); + r_count_ = 0.0; + r_sum_.Resize(block_dim_); + Check(); +} + + +void* VarNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + if (!test_mode_) { + Memo *memo = new Memo; + int32 num_frames = in.NumRows(), dim = block_dim_; + memo->num_frames = num_frames; + memo->sumsq_scale_temp.Resize(3, dim); + CuSubVector sumsq(memo->sumsq_scale_temp, 0), + scale(memo->sumsq_scale_temp, 1); + sumsq.AddDiagMat2(1.0, in, kTrans, 0.0); + scale.CopyFromVec(sumsq); + scale.AddVec(1.0, stats_sumsq_); + scale.Add((num_frames + count_) * epsilon_); + scale.Scale(1.0 / (num_frames + count_)); + scale.ApplyPow(-0.5); + // the next command will do no work if out == in, for in-place propagation. + out->CopyFromMat(in); + out->MulColsVec(scale); + return static_cast(memo); + } else { + out->CopyFromMat(in); + out->MulColsVec(scale_); + return NULL; + } +} + +void VarNormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_value, out_deriv) && + SameDim(out_value, *in_deriv) && + (out_value.NumCols() == dim_ || + out_value.NumCols() == block_dim_)); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. 
+ KALDI_ASSERT(out_value.Stride() == out_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols), + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + // we'll never use in_value, so pass it in unchanged. + Backprop(debug_info, indexes, in_value, + out_value_reshaped, out_deriv_reshaped, + memo_in, to_update_in, &in_deriv_reshaped); + return; + } + + + if (test_mode_) { + // the following statement does no work if in_deriv and out_deriv are the + // same matrix. + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scale_); + return; + } else { + Memo *memo = static_cast(memo_in); + CuSubVector sumsq(memo->sumsq_scale_temp, 0), + scale(memo->sumsq_scale_temp, 1), + r_sum(memo->sumsq_scale_temp, 2); + + // We're computing: + // r := scale * (\sum_{i=1}^n \hat{y}_i y_i) + // (the real r is this divided by n). + r_sum.AddDiagMatMat(1.0, out_deriv, kTrans, out_value, kNoTrans, 0.0); + + + if (average_r_) { + // 'to_update' is where we'll store stats related to 'r' + VarNormComponent *to_update = dynamic_cast(to_update_in); + to_update->r_sum_.AddVec(1.0, r_sum); + to_update->r_count_ += memo->num_frames; + r_sum.AddVec(1.0, this->r_sum_); + r_sum.MulElements(scale); + BaseFloat tot_count = this->r_count_ + memo->num_frames; + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scale); + in_deriv->AddMatDiagVec(-1.0 / tot_count, out_value, kNoTrans, r_sum); + } else { + r_sum.MulElements(scale); + // the following statement does no work if in_deriv and out_deriv are the + // same matrix. + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scale); + in_deriv->AddMatDiagVec(-1.0 / memo->num_frames, out_value, kNoTrans, r_sum); + } + } +} + + +void VarNormComponent::StoreStats( + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo_in) { + // in test mode this component does not store stats, it doesn't provide the + // kStoresStats flag. + KALDI_ASSERT(!test_mode_); + KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + // we'll never use in_value, so just pass it in unchanged. 
+ StoreStats(in_value, out_value_reshaped, memo_in); + return; + } + + Memo *memo = static_cast(memo_in); + KALDI_ASSERT(out_value.NumRows() == memo->num_frames); + + CuSubVector sumsq(memo->sumsq_scale_temp, 0); + stats_sumsq_.AddVec(1.0, sumsq); + count_ += memo->num_frames; + KALDI_ASSERT(count_ > 0.0); + scale_.CopyFromVec(stats_sumsq_); + scale_.Add(count_ * epsilon_); + scale_.Scale(1.0 / count_); + scale_.ApplyPow(-0.5); +} + +void VarNormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &epsilon_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &average_r_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + // We write the stats_sumsq divided by the count, to make inspection of the + // on-disk format easier. + ExpectToken(is, binary, ""); + stats_sumsq_.Read(is, binary); + scale_ = stats_sumsq_; + if (count_ != 0.0) { + scale_.Add(epsilon_); + scale_.ApplyPow(-0.5); + } + stats_sumsq_.Scale(count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &r_count_); + ExpectToken(is, binary, ""); + r_sum_.Read(is, binary); + r_sum_.Scale(r_count_); + ExpectToken(is, binary, ""); + Check(); +} + +void VarNormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, epsilon_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, average_r_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + CuVector variance(stats_sumsq_); + if (count_ != 0) { + variance.Scale(1.0 / count_); + } + WriteToken(os, binary, ""); + variance.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, r_count_); + WriteToken(os, binary, ""); + CuVector r_avg(r_sum_); + if (r_count_ != 0.0) + r_avg.Scale(1.0 / r_count_); + r_avg.Write(os, binary); + WriteToken(os, binary, ""); +} + +void VarNormComponent::Scale(BaseFloat scale) { + if (scale == 0) { + count_ = 0.0; + stats_sumsq_.SetZero(); + scale_.Set(1.0); + r_count_ = 0.0; + r_sum_.Set(0.0); + } else { + count_ *= scale; + stats_sumsq_.Scale(scale); + r_count_ *= scale; + r_sum_.Scale(scale); + } +} + + +void VarNormComponent::Add(BaseFloat alpha, const Component &other_in) { + const VarNormComponent *other = + dynamic_cast(&other_in); + count_ += alpha * other->count_; + stats_sumsq_.AddVec(alpha, other->stats_sumsq_); + if (count_ != 0.0) { + scale_.CopyFromVec(stats_sumsq_); + scale_.Scale(1.0 / count_); + scale_.Add(epsilon_); + scale_.ApplyPow(-0.5); + } else { + scale_.Set(1.0); + } + r_count_ += alpha * other->r_count_; + r_sum_.AddVec(alpha, other->r_sum_); +} + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 37ad624d0f0..0b9c22d0d92 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -150,7 +150,7 @@ class NormalizeComponent: public Component { means that the stats from n'th element of each block are pooled into one class, for each n. 
epsilon Small term added to the variance that is used to prevent - division by zero + division by zero. Defaults to 0.001. target-rms This defaults to 1.0, but if set, for instance, to 2.0, it will normalize the standard deviation of the output to 2.0. 'target-stddev' might be a more suitable name, but this @@ -212,7 +212,6 @@ class BatchNormComponent: public Component { virtual void Add(BaseFloat alpha, const Component &other); virtual void ZeroStats(); - virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } virtual void StoreStats(const CuMatrixBase &in_value, @@ -296,6 +295,370 @@ class BatchNormComponent: public Component { +/* + MeanNormComponent + + This component is intended to be used just before VarNormComponent, as a + substitute for BatchBormComponent that is suitable when different minibatches + might have different data distributions (e.g., might come from different + languages). So together they perform the same function as Batch + Renormalization. + + It adds an offset to its input that ensures that (averaged over time), the + input has zero mean. It also, optionally, mean-normalizes the backpropagated + derivatives per minibatch, which might help stabilize training (although this + is kind of non-ideal from a theoretical convergence perspective; it would be + better to do some averaging over time, but this might interact in a nontrivial + way with max-change and the like). + + What this component does can be summarized as follows; we'll write this for + a single dimension. Assume the current minibatch size is n. Then this + component does: + + count += n + m += \sum_{i=1}^n x_i + + and in the forward propagation, it does: + + y_i := x_i - (m / count) + + In the backprop, it does: + + \hat{x}_i := \hat{y}_i - backprop_normalize_scale * (\sum_i \hat{y}_i) / n + + and the user can set backprop_normalize_scale to a value in the range [0, 1]. + + In test mode it doesn't update the stats. + + You'd actually want to have 'count', and 'm', decaying over time, but that's + done as a call to this class, by calling Scale(), which scales these stats + (probably down). You can call ScaleBatchnormStats() to have this called. + + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a divisor + of 'dim'. In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from n'th element of each + block are pooled into one class, for each n. + backprop-normalize-scale Scaling value between 0 and 1 (default: 0) + which affects the behavior in backprop. + */ +class MeanNormComponent: public Component { + public: + + MeanNormComponent() { } + + // call this with 'true' to set 'test mode', which will turn off accumulation + // of statistics. + void SetTestMode(bool test_mode); + + // constructor using another component + MeanNormComponent(const MeanNormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "MeanNormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. + return kSimpleComponent|kPropagateInPlace|kBackpropInPlace| + (block_dim_ < dim_ ? 
kInputContiguous|kOutputContiguous : 0)| + (test_mode_ ? 0 : kUsesMemo|kStoresStats); + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new MeanNormComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + // Note: there is no ZeroStats() function-- we use the base-class one that + // does nothing-- since zeroing the stats at the beginning of each training + // jobs could adversely affect the performance of this method. (The stats + // stored here are not purely diagnostic like they are for most components + // that store stats). + + virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } + + virtual void StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo); + + // Members specific to this component type. + const CuVector &Offset() const { return offset_; } + + private: + + struct Memo { + // number of frames (after any reshaping). + int32 num_frames; + // Row 0 = sum = the sum of the rows of the input + // Row 1 is the offset we'll be using for this minibatch. + // Row 2 is a temporary that we use in backprop. + CuMatrix sum_offset_temp; + }; + + void Check() const; + + // Dimension of the input and output. + int32 dim_; + + // block_dim_ would normally be the same as dim_, but if it's less (and it + // must be > 0 and must divide dim_), then each separate block of the input of + // dimension 'block_dim_' is treated like a separate frame for the purposes of + // normalization. This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + + // Value in the range [0.0, 1.0] that affects how the backprop works; + // see comment above the class definition for explanation. + BaseFloat backprop_normalize_scale_; + + // If test mode is set this component doesn't store any further stats, but + // uses the stats that were previously stored. + bool test_mode_; + + // total count of stats stored by StoreStats(). + double count_; + // sum of the input data. + CuVector stats_sum_; + + // offset_ is derived from stats_sum_ and count_; it's updated every + // time stats_sum_ and count_ are updated (e.g. when StoreStats() is called). + CuVector offset_; +}; + + +/* + VarNormComponent + + This component is intended to be used just after MeanNormComponent, as a + substitute for BatchBormComponent that is suitable when different minibatches + might have different data distributions (e.g., might come from different + languages). So together they perform the same function as Batch + Renormalization. + + This component scales its input per dimension, where the scale is computed, + based on moving average stats over multiple minibatches. 
It also, per + minibatch, modifies the backpropagated statistics so as to reflect invariance + w.r.t. a scaling factor. This is similar to Batch Renormalization. + + What is does can be summarized as follows; we'll write this for a single + dimension. Assume the current minibatch size is n. Then this component does: + + count += n + s += \sum_{i=1}^n x_i^2 + scale := (epsilon + s / count)^{-0.5} + + and in the forward propagation, it does: + + y_i := x_i * scale + + In the backprop, it does (note: this is what we do in principle, but + wee the actual formula we use in practice further below): + + r := scale * (\sum_{i=1}^n \hat{y}_i y_i) / (\sum_{i=1}^n y_i^2) + + then we do: + \hat{x}_i := scale * \hat{y}_i - r * y_i + + The formula above for r is the one that ensures that: + \sum_i \hat{x}_i x_i = 0 + which is the condition where the output is insensitive to a scaling of the + input. This is kind of an approximation to what we really want; it would + be more ideal to accumulate a version of 'r' that was an average over many + minibatches, and use that. We can have an option average_r to + allow time-averaging of r. + + We can derive the formula for r as follows. First replace x_i with y_i, + since the expression differs only by a positive scalar 'scale', and y_i + ends up being more convenient: + \sum_i \hat{x}_i y_i = 0 + + \sum_i (scale * \hat{y}_i - r * y_i) * y_i = 0 + \sum_i scale * \hat{y}_i * y_i - r * y_i^2 = 0 + \sum_i scale * \hat{y}_i * y_i = r * \sum_i y_i^2 + r := scale * (\sum_i \hat{y}_i * y_i) / (\sum_i y_i^2) + \simeq scale * (\sum_i \hat{y}_i * y_i) / n + + Note: to simplify the formula for r, we make the approximation that + (\sum_{i=1}^n y_i^2) == n, which it is, almost, except when the input size is + comparable to epsilon_ or smaller (and we don't really care about this term + in that pathological case anyway-- this term's function is to prevent blowup + of the input, and we *want* blowup in that case). + + so we let r be as follows: + r := scale * (\sum_{i=1}^n \hat{y}_i y_i) / n + + You'd actually want to have 'count', and 's', decaying over time, but that's + done as a call to this class, by calling Scale(), which scales these stats + (probably down). You can call ScaleBatchnormStats() to have this called. + + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a divisor + of 'dim'. In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from n'th element of each + block are pooled into one class, for each n. + epsilon Minimum variance (added to the actual variance). + Defaults to 0.001. + average-r Boolean, default true. If true, 'r' (the term that + ensures that the input derivatives are insensitive to + scaling of the input) is averaged over time, instead of + being computed on the current minibatch. As for the + forward stats, the averaging over time (downweighting of + old frames) is controlled externally to this class, via + the utility function ScaleBatchnormStats(), which calls + the Scale() function of this component. + */ +class VarNormComponent: public Component { + public: + + VarNormComponent() { } + + // call this with 'true' to set 'test mode', which will turn off accumulation + // of statistics. 
+ void SetTestMode(bool test_mode); + + // constructor using another component + VarNormComponent(const VarNormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "VarNormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. + return kSimpleComponent|kPropagateInPlace|kBackpropInPlace| + kBackpropNeedsOutput| + (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)| + (test_mode_ ? 0 : kUsesMemo|kStoresStats); + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new VarNormComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + // Note: there is no ZeroStats() function-- we use the base-class one that + // does nothing-- since zeroing the stats at the beginning of each training + // jobs could adversely affect the performance of this method. (The stats + // stored here are not purely diagnostic like they are for most components + // that store stats). + + virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } + + virtual void StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo); + + // Members specific to this component type. + const CuVector &Scale() const { return scale_; } + + private: + + struct Memo { + // number of frames (after any reshaping). + int32 num_frames; + // Row 0 = sumsq = contains \sum_{i=1}^n x_i^2 + // Row 1 = scale (the scale we used in the forward propagation) + // Row 2 is used as a temporary in Backprop. + CuMatrix sumsq_scale_temp; + }; + + void Check() const; + + // Dimension of the input and output. + int32 dim_; + + // block_dim_ would normally be the same as dim_, but if it's less (and it + // must be > 0 and must divide dim_), then each separate block of the input of + // dimension 'block_dim_' is treated like a separate frame for the purposes of + // normalization. This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + // A configuration variable which is like a variance floor (but added, not + // floored). Defaults to 0.001. + BaseFloat epsilon_; + + // True if, in the backprop pass, we use stats averaged over multiple + // minibatches to compute the term that captures scale invariance of the + // derivatives w.r.t. the input. 
(Note: MeanNormComponent does not have this
+  // feature because I don't think that cancellation of the derivative will even
+  // be necessary in the case of the mean.)
+  bool average_r_;
+
+  // If test mode is set, this component doesn't store any further stats, but
+  // uses the stats that were previously stored.
+  bool test_mode_;
+
+  // total count of stats stored by StoreStats().
+  BaseFloat count_;
+  // sum of the input data squared, i.e. \sum_i x_i^2
+  CuVector<BaseFloat> stats_sumsq_;
+
+  // scale_ is derived from stats_sumsq_ and count_; it's updated every time
+  // stats_sumsq_ and count_ are updated (e.g. when StoreStats() is called).
+  CuVector<BaseFloat> scale_;
+
+  // The count of stats corresponding to 'r' (a quantity used in
+  // the backprop to cancel out the derivative w.r.t. the scaling
+  // factor).  Has the dimension of a number of frames, like count_,
+  // but we store it separately in case Backprop() is not called,
+  // and also because it's computed along a different path that involves
+  // the max-change code, which may make the counts differ.
+  BaseFloat r_count_;
+  // The sum of the 'r' values, scaled by r_count_; you'd divide by
+  // r_count_ to get the actual 'r' value.
+  CuVector<BaseFloat> r_sum_;
+
+};
+
+
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index e8c99494b06..32f49745c0c 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -4068,13 +4068,13 @@ bool CompositeComponent::IsUpdatable() const {
 int32 CompositeComponent::InputDim() const {
   KALDI_ASSERT(!components_.empty());
   return components_.front()->InputDim();
-};
+}
 
 // virtual
 int32 CompositeComponent::OutputDim() const {
   KALDI_ASSERT(!components_.empty());
   return components_.back()->OutputDim();
-};
+}
 
 // virtual
 int32 CompositeComponent::Properties() const {
@@ -4096,7 +4096,7 @@ int32 CompositeComponent::Properties() const {
   if (last_component_properties & kStoresStats)
     ans |= kBackpropNeedsOutput;
   return ans;
-};
+}
 
 
 MatrixStrideType CompositeComponent::GetStrideType(int32 i) const {
@@ -4319,7 +4319,7 @@ void CompositeComponent::Backprop(const std::string &debug_info,
     // optimization; other propagates might also be skippable.
     int32 properties = components_[num_components - 2]->Properties(),
        next_properties = components_[num_components - 1]->Properties();
-    if (!(properties & (kBackpropNeedsOutput || kUsesMemo)) &&
+    if (!(properties & (kBackpropNeedsOutput | kUsesMemo)) &&
        !(next_properties & kBackpropNeedsInput)) {
      num_components_to_propagate--;
    }
diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc
index a8ef30bc314..5a709519db4 100644
--- a/src/nnet3/nnet-test-utils.cc
+++ b/src/nnet3/nnet-test-utils.cc
@@ -1400,7 +1400,7 @@ void ComputeExampleComputationRequestSimple(
 static void GenerateRandomComponentConfig(std::string *component_type,
                                           std::string *config) {
 
-  int32 n = RandInt(0, 37);
+  int32 n = RandInt(0, 39);
   BaseFloat learning_rate = 0.001 * RandInt(1, 100);
 
   std::ostringstream os;
@@ -1773,6 +1773,30 @@ static void GenerateRandomComponentConfig(std::string *component_type,
       break;
     }
+    case 38: {
+      *component_type = "MeanNormComponent";
+      if (RandInt(0,1) == 0) {
+        os << "dim=" << RandInt(10,20);
+      } else {
+        int32 block_dim = RandInt(5,10), dim = RandInt(2,3) * block_dim;
+        os << "block-dim=" << block_dim << " dim=" << dim;
+      }
+      //os << " backprop-normalize-scale=" << (0.5 * RandInt(0,2));
+      os << " backprop-normalize-scale=0";  // <-- or the standard tests will fail.
+      break;
+    }
+    case 39: {
+      *component_type = "VarNormComponent";
+      if (RandInt(0,1) == 0) {
+        os << "dim=" << RandInt(10,20);
+      } else {
+        int32 block_dim = RandInt(5,10), dim = RandInt(2,3) * block_dim;
+        os << "block-dim=" << block_dim << " dim=" << dim;
+      }
+      //os << " average-r=" << (RandInt(0,1) == 0 ? "true" : "false");
+      os << " average-r=false";
+      break;
+    }
     default:
       KALDI_ERR << "Error generating random component";
   }
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 8bc3f12027b..9c1765a5870 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -518,7 +518,9 @@ void SetDropoutProportion(BaseFloat dropout_proportion,
 bool HasBatchnorm(const Nnet &nnet) {
   for (int32 c = 0; c < nnet.NumComponents(); c++) {
     const Component *comp = nnet.GetComponent(c);
-    if (dynamic_cast<BatchNormComponent*>(comp) != NULL)
+    if (dynamic_cast<BatchNormComponent*>(comp) != NULL ||
+        dynamic_cast<MeanNormComponent*>(comp) != NULL ||
+        dynamic_cast<VarNormComponent*>(comp) != NULL)
       return true;
   }
   return false;
@@ -534,6 +536,12 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale,
     BatchNormComponent *bc = dynamic_cast<BatchNormComponent*>(comp);
     if (bc != NULL)
       bc->Scale(batchnorm_stats_scale);
+    MeanNormComponent *mc = dynamic_cast<MeanNormComponent*>(comp);
+    if (mc != NULL)
+      mc->Scale(batchnorm_stats_scale);
+    VarNormComponent *vc = dynamic_cast<VarNormComponent*>(comp);
+    if (vc != NULL)
+      vc->Scale(batchnorm_stats_scale);
   }
 }
@@ -558,6 +566,13 @@ void SetBatchnormTestMode(bool test_mode, Nnet *nnet) {
     BatchNormComponent *bc = dynamic_cast<BatchNormComponent*>(comp);
     if (bc != NULL)
       bc->SetTestMode(test_mode);
+    MeanNormComponent *mc = dynamic_cast<MeanNormComponent*>(comp);
+    if (mc != NULL)
+      mc->SetTestMode(test_mode);
+    VarNormComponent *vc = dynamic_cast<VarNormComponent*>(comp);
+    if (vc != NULL)
+      vc->SetTestMode(test_mode);
+
   }
 }
@@ -1655,7 +1670,6 @@ class ModelCollapser {
                                    component_index2);
   }
-
   /**
      Tries to produce a component that's equivalent to running the component
     'component_index2' with input given by 'component_index1'.  This handles
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index 60a18f15d84..2cb7e68bc83 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -168,10 +168,12 @@ std::string NnetInfo(const Nnet &nnet);
 void SetDropoutProportion(BaseFloat dropout_proportion,
                           Nnet *nnet);
 
-/// Returns true if nnet has at least one component of type BatchNormComponent.
+/// Returns true if nnet has at least one component of type BatchNormComponent,
+/// MeanNormComponent or VarNormComponent.
 bool HasBatchnorm(const Nnet &nnet);
 
-/// This function affects only components of type BatchNormComponent.
+/// This function affects only components of type BatchNormComponent,
+/// MeanNormComponent or VarNormComponent.
 /// It sets "test mode" on such components (if you call it with test_mode =
 /// true, otherwise it would set normal mode, but this wouldn't be needed
 /// often).
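The decaying average that ScaleBatchnormStats() drives for these components can be pictured with a small standalone sketch (plain C++; the frame counts and values are made up, and this is not part of the patch): each Scale() call multiplies the accumulated count and sum of squares by the same factor, so the current normalizing scale is unchanged while older minibatches receive exponentially less weight in future updates.

    // Sketch of how periodic Scale() calls decay the accumulated stats.
    #include <cmath>
    #include <cstdio>

    int main() {
      const double epsilon = 0.001, stats_scale = 0.5;  // e.g. a ScaleBatchnormStats() factor
      double count = 0.0, sumsq = 0.0;                  // analogous to count_ and stats_sumsq_

      for (int iter = 1; iter <= 5; iter++) {
        double n = 100.0;               // frames in this (pretend) minibatch
        double msq = 1.0 + 0.1 * iter;  // its mean-square input value
        count += n;
        sumsq += n * msq;
        double scale = 1.0 / std::sqrt(epsilon + sumsq / count);
        std::printf("iter %d: count = %g, scale = %g\n", iter, count, scale);

        // The periodic decay: both sums shrink by the same factor, so sumsq/count
        // (and hence 'scale') is unchanged, but the next minibatch now carries
        // relatively more weight.
        count *= stats_scale;
        sumsq *= stats_scale;
      }
      return 0;
    }

The same scaling would apply to r_count_ and r_sum_, so the time-averaged 'r' used when average-r=true is forgotten at the same rate as the forward stats.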
"test mode" means that instead of using statistics from the batch, diff --git a/src/nnet3bin/nnet3-compute-batch.cc b/src/nnet3bin/nnet3-compute-batch.cc index b0001c96f57..5d4b9b1db48 100644 --- a/src/nnet3bin/nnet3-compute-batch.cc +++ b/src/nnet3bin/nnet3-compute-batch.cc @@ -80,6 +80,10 @@ int main(int argc, char *argv[]) { "priors stored with the model (in this case, " "a .mdl file is expected as input)."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc index 45fde99a4f5..cf133025aae 100644 --- a/src/nnet3bin/nnet3-compute.cc +++ b/src/nnet3bin/nnet3-compute.cc @@ -78,6 +78,10 @@ int main(int argc, char *argv[]) { "priors stored with the model (in this case, " "a .mdl file is expected as input)."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { diff --git a/src/nnet3bin/nnet3-latgen-faster-batch.cc b/src/nnet3bin/nnet3-latgen-faster-batch.cc index fad2d5ed356..ec52cff9776 100644 --- a/src/nnet3bin/nnet3-latgen-faster-batch.cc +++ b/src/nnet3bin/nnet3-latgen-faster-batch.cc @@ -108,6 +108,10 @@ int main(int argc, char *argv[]) { po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 4) { diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc index a4bc89a7def..e327681cf9b 100644 --- a/src/nnet3bin/nnet3-xvector-compute.cc +++ b/src/nnet3bin/nnet3-xvector-compute.cc @@ -113,6 +113,10 @@ int main(int argc, char *argv[]) { po.Register("pad-input", &pad_input, "If true, duplicate the first and " "last frames of the input features as required to equal min-chunk-size."); +#if HAVE_CUDA==1 + CuDevice::RegisterDeviceOptions(&po); +#endif + po.Read(argc, argv); if (po.NumArgs() != 3) { diff --git a/src/nnetbin/cuda-gpu-available.cc b/src/nnetbin/cuda-gpu-available.cc index 390468d3046..41d0227ce08 100644 --- a/src/nnetbin/cuda-gpu-available.cc +++ b/src/nnetbin/cuda-gpu-available.cc @@ -46,8 +46,7 @@ int main(int argc, char *argv[]) try { KALDI_WARN << "Cannot get hostname, " << strerror(errno); } #endif - KALDI_LOG << std::endl << std::endl - << "### IS CUDA GPU AVAILABLE? '" << hostname << "' ###"; + KALDI_LOG << "\n\n### IS CUDA GPU AVAILABLE? '" << hostname << "' ###"; #if HAVE_CUDA == 1 CuDevice::Instantiate().SelectGpuId("yes"); fprintf(stderr, "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ##\n\n"); @@ -76,9 +75,9 @@ int main(int argc, char *argv[]) try { return 0; #else std::cerr - << "### CUDA WAS NOT COMPILED IN! ###" << std::endl + << "### CUDA WAS NOT COMPILED IN! 
###\n" << "To support CUDA, you must run 'configure' on a machine " - << "that has the CUDA compiler 'nvcc' available."; + << "that has the CUDA compiler 'nvcc' available.\n"; return 1; #endif } catch (const std::exception &e) { @@ -95,4 +94,3 @@ int main(int argc, char *argv[]) try { << "### - You should see your GPU (burnt GPUs may disappear from the list until reboot),"; return -1; } - diff --git a/src/online/online-audio-source.cc b/src/online/online-audio-source.cc index 7b3c31682aa..5998be0690f 100644 --- a/src/online/online-audio-source.cc +++ b/src/online/online-audio-source.cc @@ -72,18 +72,18 @@ OnlinePaSource::OnlinePaSource(const uint32 timeout, &pa_ringbuf_, sizeof(SampleType), rb_size_ / sizeof(SampleType), ring_buffer_); if (rbs != 0) - throw runtime_error("Unexpected PortAudio ring buffer init error"); + KALDI_ERR << "PortAudio ring buffer init error"; PaError paerr = Pa_Initialize(); if (paerr != paNoError) - throw runtime_error("PortAudio initialization error"); + KALDI_ERR << "PortAudio initialization error"; // Monophone, 16-bit input hardcoded KALDI_ASSERT(sizeof(SampleType) == 2 && "The current OnlinePaSource code assumes 16-bit input"); paerr = Pa_OpenDefaultStream(&pa_stream_, 1, 0, paInt16, sample_rate_, 0, PaCallback, this); if (paerr != paNoError) - throw runtime_error("PortAudio failed to open the default stream"); + KALDI_ERR << "PortAudio failed to open the default stream"; } @@ -103,7 +103,7 @@ bool OnlinePaSource::Read(Vector *data) { if (!pa_started_) { // start stream the first time Read() is called PaError paerr = Pa_StartStream(pa_stream_); if (paerr != paNoError) - throw std::runtime_error("Error while trying to open PortAudio stream"); + KALDI_ERR << "Error while trying to open PortAudio stream"; pa_started_ = true; } Timer timer; diff --git a/src/online/online-audio-source.h b/src/online/online-audio-source.h index d880660d24f..64153e9cd52 100644 --- a/src/online/online-audio-source.h +++ b/src/online/online-audio-source.h @@ -42,7 +42,7 @@ class OnlineAudioSourceItf { // The function returns true if there may be more data, and false if it // knows we are at the end of the stream. // In case an unexpected and unrecoverable error occurs the function throws - // an exception of type std::runtime_error (e.g. by using KALDI_ERR macro). + // an exception of type KaldiFatalError (by using KALDI_ERR macro). // // NOTE: The older version of this interface had a second paramater - "timeout". 
// We decided to remove it, because we don't envision usage scenarios,
diff --git a/src/online/online-feat-input.h b/src/online/online-feat-input.h
index b730a373ac0..e433c386212 100644
--- a/src/online/online-feat-input.h
+++ b/src/online/online-feat-input.h
@@ -31,6 +31,7 @@
 #include "online-audio-source.h"
 #include "feat/feature-functions.h"
+#include "feat/feature-window.h"
 
 namespace kaldi {
 
@@ -275,7 +276,8 @@ class OnlineFeInput : public OnlineFeatInputItf {
   // "frame_size" - frame extraction window size in audio samples
   // "frame_shift" - feature frame width in audio samples
   OnlineFeInput(OnlineAudioSourceItf *au_src, E *fe,
-                const int32 frame_size, const int32 frame_shift);
+                const int32 frame_size, const int32 frame_shift,
+                const bool snip_edges = true);
 
   virtual int32 Dim() const { return extractor_->Dim(); }
 
@@ -287,15 +289,26 @@ class OnlineFeInput : public OnlineFeatInputItf {
   const int32 frame_size_;
   const int32 frame_shift_;
   Vector<BaseFloat> wave_; // the samples to be passed for extraction
+  Vector<BaseFloat> wave_remainder_; // the samples remaining from the previous
+                                     // feature batch
+  FrameExtractionOptions frame_opts_;
 
   KALDI_DISALLOW_COPY_AND_ASSIGN(OnlineFeInput);
 };
 
 template<class E>
 OnlineFeInput<E>::OnlineFeInput(OnlineAudioSourceItf *au_src, E *fe,
-                                int32 frame_size, int32 frame_shift)
+                                int32 frame_size, int32 frame_shift,
+                                bool snip_edges)
     : source_(au_src), extractor_(fe),
-      frame_size_(frame_size), frame_shift_(frame_shift) {}
+      frame_size_(frame_size), frame_shift_(frame_shift) {
+  // we need a FrameExtractionOptions to call NumFrames()
+  // 1000 is just a fake sample rate which equates ms and samples
+  frame_opts_.samp_freq = 1000;
+  frame_opts_.frame_shift_ms = frame_shift;
+  frame_opts_.frame_length_ms = frame_size;
+  frame_opts_.snip_edges = snip_edges;
+}
 
 template<class E> bool
 OnlineFeInput<E>::Compute(Matrix<BaseFloat> *output) {
@@ -311,11 +324,26 @@ OnlineFeInput<E>::Compute(Matrix<BaseFloat> *output) {
   bool ans = source_->Read(&read_samples);
 
+  Vector<BaseFloat> all_samples(wave_remainder_.Dim() + read_samples.Dim());
+  all_samples.Range(0, wave_remainder_.Dim()).CopyFromVec(wave_remainder_);
+  all_samples.Range(wave_remainder_.Dim(), read_samples.Dim()).
+      CopyFromVec(read_samples);
+
   // Extract the features
-  if (read_samples.Dim() >= frame_size_) {
-    extractor_->Compute(read_samples, 1.0, output);
+  if (all_samples.Dim() >= frame_size_) {
+    // extract waveform remainder before calling Compute()
+    int32 num_frames = NumFrames(all_samples.Dim(), frame_opts_);
+    // offset is the amount at the start that has been extracted.
+    int32 offset = num_frames * frame_shift_;
+    int32 remaining_len = all_samples.Dim() - offset;
+    wave_remainder_.Resize(remaining_len);
+    KALDI_ASSERT(remaining_len >= 0);
+    if (remaining_len > 0)
+      wave_remainder_.CopyFromVec(SubVector<BaseFloat>(all_samples, offset, remaining_len));
+    extractor_->Compute(all_samples, 1.0, output);
   } else {
     output->Resize(0, 0);
+    wave_remainder_ = all_samples;
   }
 
   return ans;
diff --git a/src/online2/online-nnet2-feature-pipeline.h b/src/online2/online-nnet2-feature-pipeline.h
index d8f933a090d..e379f7263ec 100644
--- a/src/online2/online-nnet2-feature-pipeline.h
+++ b/src/online2/online-nnet2-feature-pipeline.h
@@ -228,18 +228,25 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
   /// rescoring the lattices, this may not be much of an issue.
   void InputFinished();
 
-  // This function returns the ivector-extracting part of the feature pipeline
-  // (or NULL if iVectors are not being used); the pointer is owned here and not
-  // given to the caller.
This function is used in nnet3, and also in the - // silence-weighting code used to exclude silence from the iVector estimation. + // This function returns the iVector-extracting part of the feature pipeline + // (or NULL if iVectors are not being used); the pointer ownership is retained + // by this object and not transferred to the caller. This function is used in + // nnet3, and also in the silence-weighting code used to exclude silence from + // the iVector estimation. OnlineIvectorFeature *IvectorFeature() { return ivector_feature_; } + // A const accessor for the iVector extractor. Returns NULL if iVectors are + // not being used. + const OnlineIvectorFeature *IvectorFeature() const { + return ivector_feature_; + } + // This function returns the part of the feature pipeline that would be given // as the primary (non-iVector) input to the neural network in nnet3 // applications. - OnlineFeatureInterface *InputFeature() { + OnlineFeatureInterface *InputFeature() { return feature_plus_optional_pitch_; } diff --git a/src/onlinebin/online-audio-client.cc b/src/onlinebin/online-audio-client.cc index 241aee426cc..577204b65e7 100644 --- a/src/onlinebin/online-audio-client.cc +++ b/src/onlinebin/online-audio-client.cc @@ -85,7 +85,7 @@ int main(int argc, char** argv) { int32 client_desc = socket(AF_INET, SOCK_STREAM, 0); if (client_desc == -1) { - std::cerr << "ERROR: couldn't create socket!" << std::endl; + std::cerr << "ERROR: couldn't create socket!\n"; return -1; } @@ -96,8 +96,8 @@ int main(int argc, char** argv) { if (addr == INADDR_NONE) { hp = gethostbyname(server_addr_str.c_str()); if (hp == NULL) { - std::cerr << "ERROR: couldn't resolve host string: " << server_addr_str - << std::endl; + std::cerr << "ERROR: couldn't resolve host string: " + << server_addr_str << '\n'; close(client_desc); return -1; } @@ -110,13 +110,13 @@ int main(int argc, char** argv) { server.sin_family = AF_INET; server.sin_port = htons(server_port); if (::connect(client_desc, (struct sockaddr*) &server, sizeof(server))) { - std::cerr << "ERROR: couldn't connect to server!" 
<< std::endl; + std::cerr << "ERROR: couldn't connect to server!\n"; close(client_desc); return -1; } KALDI_VLOG(2) << "Connected to KALDI server at host " << server_addr_str - << " port " << server_port << std::endl; + << " port " << server_port; char* pack_buffer = new char[packet_size]; @@ -124,7 +124,7 @@ int main(int argc, char** argv) { for (; !reader.Done(); reader.Next()) { std::string wav_key = reader.Key(); - KALDI_VLOG(2) << "File: " << wav_key << std::endl; + KALDI_VLOG(2) << "File: " << wav_key; const WaveData &wav_data = reader.Value(); @@ -257,8 +257,7 @@ int main(int argc, char** argv) { { float speed = total_input_dur / total_reco_dur; - KALDI_VLOG(2) << "Recognized (" << speed << "xRT): " << reco_output - << std::endl; + KALDI_VLOG(2) << "Recognized (" << speed << "xRT): " << reco_output; } if (htk) { @@ -266,7 +265,8 @@ int main(int argc, char** argv) { std::ofstream htk_file(name.c_str()); for (size_t i = 0; i < results.size(); i++) htk_file << (int) (results[i].start * 10000000) << " " - << (int) (results[i].end * 10000000) << " " << results[i].word << std::endl; + << (int) (results[i].end * 10000000) << " " + << results[i].word << "\n"; htk_file.close(); } @@ -309,12 +309,13 @@ int main(int argc, char** argv) { std::string name = wav_key + ".vtt"; std::ofstream vtt_file(name.c_str()); - vtt_file << "WEBVTT FILE" << std::endl << std::endl; + vtt_file << "WEBVTT FILE\n\n"; for (size_t i = 0; i < subtitles.size(); i++) - vtt_file << (i + 1) << std::endl << TimeToTimecode(subtitles[i].start) - << " --> " << TimeToTimecode(subtitles[i].end) << std::endl - << subtitles[i].word << std::endl << std::endl; + vtt_file << (i + 1) << "\n" + << TimeToTimecode(subtitles[i].start) << " --> " + << TimeToTimecode(subtitles[i].end) << "\n" + << subtitles[i].word << "\n\n"; vtt_file.close(); } diff --git a/src/probe/README.slow_expf b/src/probe/README.slow_expf index 00c9ce5be09..c20386b8137 100644 --- a/src/probe/README.slow_expf +++ b/src/probe/README.slow_expf @@ -1,5 +1,6 @@ -On some machines, expf() turns out to be very slow: much slower than its double precision counterpart exp(). -Probably this is concerned with the version of glibc. +On some machines, expf() turns out to be very slow: much slower than its double +precision counterpart exp(). Probably this is concerned with the version of +glibc. Here are a couple of examples: @@ -21,5 +22,7 @@ configuration$ ./exp-test exp() time: 0.0028439 expf() time: 0.00713329 -If slow behaviour is detected, then KALDI_NO_EXPF macro will be used, and the Exp() wrapper in base/kaldi-math.h will use exp() even for single precision floats. -The behaviour of expf() is considered to be slow if it is slower than exp() by at least 10%. \ No newline at end of file +If slow behaviour is detected, then KALDI_NO_EXPF macro will be used, and the +Exp() wrapper in base/kaldi-math.h will use exp() even for single precision +floats. The behaviour of expf() is considered to be slow if it is slower than +exp() by at least 10%. diff --git a/src/probe/exp-test.cc b/src/probe/exp-test.cc index 1fd8a64c6a6..d6cc76d4ce2 100644 --- a/src/probe/exp-test.cc +++ b/src/probe/exp-test.cc @@ -17,35 +17,52 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. +// Read Makefile.slow_expf. This test must be compiled with -O0. 
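For context on what this probe's exit status controls: when slow expf() behaviour is detected, the build defines the KALDI_NO_EXPF macro, and the single-precision Exp() wrapper then falls back to the double-precision exp(). A simplified, illustrative sketch of that dispatch (not the verbatim contents of base/kaldi-math.h) looks like this:

    #include <cmath>
    #include <cstdio>

    namespace kaldi {
    inline double Exp(double x) { return exp(x); }
    #ifndef KALDI_NO_EXPF
    inline float Exp(float x) { return expf(x); }  // normal case: expf() is fast enough
    #else
    inline float Exp(float x) { return exp(static_cast<double>(x)); }  // slow-expf machines
    #endif
    }  // namespace kaldi

    int main() {
      std::printf("Exp(1.0f) = %f\n", kaldi::Exp(1.0f));
      return 0;
    }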
+
 #include <cmath>
 #include <iostream>
 #include "base/timer.h"
 
-#define SAMPLE 100000
+int main() {
+  int test_iter = 300000;
+
+  // Make sure that the CPU bumps its clock to full speed: run the first loop
+  // without timing.  Then increase the sample iteration count exponentially
+  // until the loop takes at least 10ms.  We run this loop 1/4 of the number of
+  // actual test iterations and call both exp() and expf(), so that the overall
+  // test run will take 20 to 60 ms, to ensure a sensibly measurable result.
+  for (bool first = true; ; first=false) {
+    kaldi::Timer timer;
+    for(int i = 0; i < test_iter; i += 4) {
+      (void)exp((double)(i & 0x0F));
+      (void)expf((double)(i & 0x0F));
+    }
+    double time = timer.Elapsed();
+    if (first) continue;
+    if (time > 0.01) break;
+    test_iter *= 3;
+  }
-
-int main() {
-  float dummy = 0.0;
   kaldi::Timer exp_timer;
-  for(int i = 0; i < SAMPLE; ++i) {
-    dummy += exp((double)(i % 10));
+  for(int i = 0; i < test_iter; ++i) {
+    (void)exp((double)(i & 0x0F));
   }
   double exp_time = exp_timer.Elapsed();
 
   kaldi::Timer expf_timer;
-  for(int i = 0; i < SAMPLE; ++i) {
-    dummy += expf((double)(i % 10));
+  for(int i = 0; i < test_iter; ++i) {
+    (void)expf((double)(i & 0x0F));
   }
   double expf_time = expf_timer.Elapsed();
-
-  // Often exp() and expf() perform very similarly,
-  // so we will replace expf() by exp() only if there is at least 10% difference
-  if (expf_time < exp_time * 1.1) {
+
+  double ratio = expf_time / exp_time;
+  if (ratio < 1.1) {
+    // Often exp() and expf() perform very similarly, so we will replace expf()
+    // by exp() only if there is at least 10% difference.
     return 0;
-  } else {
-    std::cerr << "exp() time: " << exp_time << std::endl;
-    std::cerr << "expf() time: " << expf_time << std::endl;
-    return 1;
   }
-
-  std::cerr << dummy << std::endl;  // No complaint about the unused variable
+
+  std::cerr << ("WARNING: slow expf() detected.  expf() is slower than exp() "
+                "by a factor of ") << ratio << "\n";
+  return 1;
 }
diff --git a/src/rnnlm/rnnlm-core-training.cc b/src/rnnlm/rnnlm-core-training.cc
index 5a1ae97895f..d1a01f7ef66 100644
--- a/src/rnnlm/rnnlm-core-training.cc
+++ b/src/rnnlm/rnnlm-core-training.cc
@@ -302,7 +302,7 @@ void RnnlmCoreTrainer::PrintMaxChangeStats() const {
                 << ", per-component max-change was enforced "
                 << ((100.0 * num_max_change_per_component_applied_[i]) /
                     num_minibatches_processed_)
-                << "\% of the time.";
+                << "% of the time.";
       i++;
     }
   }
@@ -312,7 +312,7 @@ void RnnlmCoreTrainer::PrintMaxChangeStats() const {
             (num_minibatches_processed_ *
             (config_.backstitch_training_scale == 0.0 ?
1.0 : 1.0 + 1.0 / config_.backstitch_training_interval)) - << "\% of the time."; + << "% of the time."; } void RnnlmCoreTrainer::ProcessOutput( diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc index c4238c7356a..0b5916b6bba 100644 --- a/src/rnnlm/rnnlm-embedding-training.cc +++ b/src/rnnlm/rnnlm-embedding-training.cc @@ -117,9 +117,9 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( bool is_backstitch_step1, CuMatrixBase *embedding_deriv) { - // backstitch training is incompatible with momentum > 0 + // backstitch training is incompatible with momentum > 0 KALDI_ASSERT(config_.momentum == 0.0); - + // If relevant, do the following: // "embedding_deriv += - 2 * l2_regularize * embedding_mat_" // This is an approximate to the regular l2 regularization (add l2 regularization @@ -130,7 +130,7 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( embedding_deriv->AddMat(1.0 / (1.0 + config_.backstitch_training_scale) * l2_term, *embedding_mat_); } - } + } BaseFloat scale = 1.0; if (config_.use_natural_gradient) { @@ -213,7 +213,7 @@ void RnnlmEmbeddingTrainer::Train( } void RnnlmEmbeddingTrainer::TrainBackstitch( - bool is_backstitch_step1, + bool is_backstitch_step1, const CuArrayBase &active_words, CuMatrixBase *embedding_deriv) { @@ -232,7 +232,7 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( embedding_deriv->AddRows(l2_term / (1.0 + config_.backstitch_training_scale), *embedding_mat_, active_words); } - } + } BaseFloat scale = 1.0; if (config_.use_natural_gradient) { if (is_backstitch_step1) preconditioner_.Freeze(true); @@ -273,7 +273,7 @@ void RnnlmEmbeddingTrainer::PrintStats() { (num_minibatches_ * (config_.backstitch_training_scale == 0.0 ? 1.0 : 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; + << " % of the time."; Matrix delta_embedding_mat(*embedding_mat_); delta_embedding_mat.AddMat(-1.0, initial_embedding_mat_); diff --git a/src/tree/build-tree-questions.h b/src/tree/build-tree-questions.h index a6bcfdd500b..22f12d62912 100644 --- a/src/tree/build-tree-questions.h +++ b/src/tree/build-tree-questions.h @@ -52,7 +52,7 @@ struct QuestionsForKey { // Configuration class associated with a particular ke std::vector > initial_questions; RefineClustersOptions refine_opts; // if refine_opts.max_iter == 0, // we just pick from the initial questions. - + QuestionsForKey(int32 num_iters = 5): refine_opts(num_iters, 2) { // refine_cfg with 5 iters and top-n = 2 (this is no restriction because // RefineClusters called with 2 clusters; would get set to that anyway as @@ -102,7 +102,9 @@ class Questions { // careful, this is a class. KALDI_ASSERT(keys_out != NULL); CopyMapKeysToVector(key_idx_, keys_out); } - const bool HasQuestionsForKey(EventKeyType key) const { return (key_idx_.count(key) != 0); } + bool HasQuestionsForKey(EventKeyType key) const { + return (key_idx_.count(key) != 0); + } ~Questions() { kaldi::DeletePointers(&key_options_); } diff --git a/src/tree/build-tree-utils.cc b/src/tree/build-tree-utils.cc index 4c9be833185..254d7ec36d8 100644 --- a/src/tree/build-tree-utils.cc +++ b/src/tree/build-tree-utils.cc @@ -400,7 +400,7 @@ BaseFloat FindBestSplitForKey(const BuildTreeStatsType &stats, for (size_t i = 0;i < assignments.size();i++) if (assignments[i] == 1) yes_set.push_back(i); } *yes_set_out = yes_set; - + DeletePointers(&clusters); #ifdef KALDI_PARANOID { // Check the "ans" is correct. 
@@ -763,10 +763,9 @@ EventMap *GetToLengthMap(const BuildTreeStatsType &stats, int32 P, std::vector stats_by_phone; try { SplitStatsByKey(stats, P, &stats_by_phone); - } catch(const std::runtime_error &err) { - KALDI_ERR << "Caught exception in GetToLengthMap: you seem " - "to have provided invalid stats [no central-phone " - "key]. Message was: " << err.what(); + } catch(const KaldiFatalError &) { + KALDI_ERR << + "You seem to have provided invalid stats [no central-phone key]."; } std::map phone_to_length; for (size_t p = 0; p < stats_by_phone.size(); p++) { @@ -774,10 +773,9 @@ EventMap *GetToLengthMap(const BuildTreeStatsType &stats, int32 P, std::vector stats_by_length; try { SplitStatsByKey(stats_by_phone[p], kPdfClass, &stats_by_length); - } catch(const std::runtime_error &err) { - KALDI_ERR << "Caught exception in GetToLengthMap: you seem " - "to have provided invalid stats [no position " - "key]. Message was: " << err.what(); + } catch(const KaldiFatalError &) { + KALDI_ERR << + "You seem to have provided invalid stats [no position key]."; } size_t length = stats_by_length.size(); for (size_t i = 0; i < length; i++) { @@ -868,7 +866,7 @@ EventMap *ClusterEventMapToNClustersRestrictedByMap( int32 *num_removed_ptr) { std::vector split_stats; SplitStatsByMap(stats, e_restrict, &split_stats); - + if (num_clusters_required < split_stats.size()) { KALDI_WARN << "num-clusters-required is less than size of map. Not doing anything."; if (num_removed_ptr) *num_removed_ptr = 0; @@ -904,10 +902,10 @@ EventMap *ClusterEventMapToNClustersRestrictedByMap( if (j > max_index) max_index = j; } } - + normalizer += SumClusterableNormalizer(summed_stats_contiguous[i]); - } else { - // Even if split_stats[i] is empty, a cluster will be assigned to + } else { + // Even if split_stats[i] is empty, a cluster will be assigned to // that. To compensate, we decrease the num-clusters required. num_non_empty_clusters_required--; } @@ -919,7 +917,7 @@ EventMap *ClusterEventMapToNClustersRestrictedByMap( if (num_non_empty_clusters_required > num_non_empty_clusters) { KALDI_WARN << "Cannot get required num-clusters " << num_clusters_required << " as number of non-empty clusters required is larger than " - << " number of non-empty clusters: " << num_non_empty_clusters_required + << " number of non-empty clusters: " << num_non_empty_clusters_required << " > " << num_non_empty_clusters; if (num_removed_ptr) *num_removed_ptr = 0; return e_in.Copy(); @@ -929,7 +927,7 @@ EventMap *ClusterEventMapToNClustersRestrictedByMap( BaseFloat change = ClusterBottomUpCompartmentalized( summed_stats_contiguous, std::numeric_limits::infinity(), - num_non_empty_clusters_required, + num_non_empty_clusters_required, NULL, // don't need clusters out. &assignments); // this algorithm is quadratic, so might be quite slow. @@ -1052,7 +1050,7 @@ EventMap *GetStubMap(int32 P, // Do a split. Recurse. 
size_t half_sz = phone_sets.size() / 2; std::vector >::const_iterator half_phones = - phone_sets.begin() + half_sz; + phone_sets.begin() + half_sz; std::vector::const_iterator half_share = share_roots.begin() + half_sz; std::vector > phone_sets_1, phone_sets_2; @@ -1127,4 +1125,3 @@ bool ConvertStats(int32 oldN, int32 oldP, int32 newN, int32 newP, } // end namespace kaldi - diff --git a/src/tree/cluster-utils-test.cc b/src/tree/cluster-utils-test.cc index fd5d9690939..8eee3fb5505 100644 --- a/src/tree/cluster-utils-test.cc +++ b/src/tree/cluster-utils-test.cc @@ -97,10 +97,11 @@ static void TestObjfPlus() { AssertEqual(a.Objf(), (BaseFloat)0.0); AssertEqual(b.Objf(), (BaseFloat)0.0); AssertEqual( a.ObjfPlus(b), -0.5 * (1.0-2.5)*(1.0-2.5)); // 0.5 because half-distance, squared = 1/4, times two points... - KALDI_LOG << "Non-binary Output: "<<'\n'; - a.Write(KALDI_LOG, false); - KALDI_LOG << "Binary Output: "<<'\n'; - a.Write(KALDI_LOG, true); + KALDI_LOG << "Non-binary Output:"; + a.Write(std::cerr, false); + std::cerr << "\nBinary Output:\n"; + a.Write(std::cerr, true); + std::cerr << "\n"; } static void TestObjfMinus() { @@ -395,7 +396,7 @@ static void TestClusterKMeansVector() { std::vector points; for (size_t j = 0; j < n_clust; j++) { size_t n_points = 1 + Rand() % 5; - + Vector clust_center(dim); clust_center.SetRandn(); for (size_t k = 0; k < n_points; k++) { @@ -573,5 +574,3 @@ int main() { TestClusterBottomUp(); TestRefineClusters(); } - - diff --git a/src/util/kaldi-pipebuf.h b/src/util/kaldi-pipebuf.h index 9b83cdccc3d..61034ac2757 100644 --- a/src/util/kaldi-pipebuf.h +++ b/src/util/kaldi-pipebuf.h @@ -82,7 +82,6 @@ class basic_pipebuf : public std::basic_filebuf { }; // class basic_pipebuf #endif // _MSC_VER -}; // namespace kaldi +} // namespace kaldi #endif // KALDI_UTIL_KALDI_PIPEBUF_H_ - diff --git a/src/util/kaldi-table.h b/src/util/kaldi-table.h index e3a80b2743b..bb7177ad051 100644 --- a/src/util/kaldi-table.h +++ b/src/util/kaldi-table.h @@ -383,8 +383,7 @@ class TableWriter { // Returns true if open for writing. bool IsOpen() const; - // Write the object. Throws std::runtime_error on error (via the - // KALDI_ERR macro) + // Write the object. Throws KaldiFatalError on error via the KALDI_ERR macro. inline void Write(const std::string &key, const T &value) const; diff --git a/src/util/parse-options.cc b/src/util/parse-options.cc index 2f75cb655f9..667d9e91c94 100644 --- a/src/util/parse-options.cc +++ b/src/util/parse-options.cc @@ -323,14 +323,7 @@ int ParseOptions::Read(int argc, const char *const argv[]) { #else const char *c = strrchr(argv[0], '/'); #endif - if (c == NULL) - c = argv[0]; - else - c++; - char *program_name = new char[strlen(c)+1]; - strcpy(program_name, c); - delete [] g_program_name; - g_program_name = program_name; + SetProgramName(c == NULL ? argv[0] : c + 1); } // first pass: look for config parameter, look for priority for (i = 1; i < argc; i++) { diff --git a/tools/extras/install_portaudio.sh b/tools/extras/install_portaudio.sh index ed9529477a6..58797f554e8 100755 --- a/tools/extras/install_portaudio.sh +++ b/tools/extras/install_portaudio.sh @@ -82,7 +82,7 @@ if [ -z "$MACOS" ]; then fi ./configure --prefix=`pwd`/install --with-pic -sed -i.bk '40s:src/common/pa_ringbuffer.o::g; 40s:$: src/common/pa_ringbuffer.o:' Makefile +perl -i -pe 's:src/common/pa_ringbuffer.o:: if /^OTHER_OBJS\s*=/' Makefile if [ "$MACOS" != "" ]; then echo "detected MacOS operating system ... trying to fix Makefile"