diff --git a/egs/aishell/s5/local/aishell_train_lms.sh b/egs/aishell/s5/local/aishell_train_lms.sh index ea72614689d..9b6cdad2960 100755 --- a/egs/aishell/s5/local/aishell_train_lms.sh +++ b/egs/aishell/s5/local/aishell_train_lms.sh @@ -23,7 +23,7 @@ kaldi_lm=`which train_lm.sh` if [ -z $kaldi_lm ]; then echo "$0: train_lm.sh is not found. That might mean it's not installed" echo "$0: or it is not added to PATH" - echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it" exit 1 fi diff --git a/egs/aishell2/s5/local/train_lms.sh b/egs/aishell2/s5/local/train_lms.sh index fbe95d898a1..179a7b78e14 100755 --- a/egs/aishell2/s5/local/train_lms.sh +++ b/egs/aishell2/s5/local/train_lms.sh @@ -24,7 +24,7 @@ kaldi_lm=`which train_lm.sh` if [ -z $kaldi_lm ]; then echo "$0: train_lm.sh is not found. That might mean it's not installed" echo "$0: or it is not added to PATH" - echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it" exit 1 fi diff --git a/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl b/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl new file mode 100755 index 00000000000..71b26b55de5 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2017 David Snyder +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 <path-to-LDC98S75> <path-to-output>\n"; + print STDERR "e.g. $0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +%wavs = (); +while(<WAVLIST>) { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId = $t1[0]; + $wavs{$uttId} = $sph; +} + +while (<CS>) { + $line = $_ ; + @A = split(",", $line); + @A1 = split("[./]",$A[0]); + $wav = $A1[0]; + if (/$wav/i ~~ @badAudio) { + # do nothing + print "Bad Audio = $wav"; + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . $A[3]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wavs{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . 
$wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wavs{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/heroico/s5/RESULTS b/egs/heroico/s5/RESULTS index 9717e95e6e2..7942c03b1d9 100644 --- a/egs/heroico/s5/RESULTS +++ b/egs/heroico/s5/RESULTS @@ -1,22 +1,48 @@ # for dir in $(echo exp/tri*/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done -%WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 -%WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 -%WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 -%WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] exp/tri1/decode_test/wer_13_1.0 -%WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 -%WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 -%WER 67.51 [ 6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 -%WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 -%WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 -%WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 -%WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 -%WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 -%WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 -%WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 -%WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 -%WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 -%WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 -%WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 -%WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 -%WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 +# old results before adding Movie subtitles text corpus in LM training: +# %WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 +# %WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 +# %WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 +# %WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] exp/tri1/decode_test/wer_13_1.0 +# %WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 +# %WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 +# %WER 67.51 [ 
6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 +# %WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 +# %WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 +# %WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 +# %WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 +# %WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +# %WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 +# %WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +# %WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 +# %WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 +# %WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 +# %WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 +# %WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 +# %WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 + +# new results: +%WER 18.27 [ 1398 / 7650, 213 ins, 253 del, 932 sub ] exp/tri1/decode_devtest/wer_15_0.5 +%WER 9.95 [ 746 / 7498, 74 ins, 108 del, 564 sub ] exp/tri1/decode_native/wer_13_0.5 +%WER 16.63 [ 1532 / 9215, 197 ins, 183 del, 1152 sub ] exp/tri1/decode_nonnative/wer_17_0.0 +%WER 13.68 [ 2287 / 16713, 207 ins, 360 del, 1720 sub ] exp/tri1/decode_test/wer_17_0.5 +%WER 17.19 [ 1315 / 7650, 227 ins, 231 del, 857 sub ] exp/tri2b/decode_devtest/wer_17_0.5 +%WER 9.23 [ 692 / 7498, 60 ins, 103 del, 529 sub ] exp/tri2b/decode_native/wer_16_0.5 +%WER 17.16 [ 1581 / 9215, 184 ins, 216 del, 1181 sub ] exp/tri2b/decode_nonnative/wer_17_0.5 +%WER 13.64 [ 2279 / 16713, 241 ins, 326 del, 1712 sub ] exp/tri2b/decode_test/wer_17_0.5 +%WER 15.36 [ 1175 / 7650, 212 ins, 210 del, 753 sub ] exp/tri3b/decode_devtest/wer_17_0.5 +%WER 20.27 [ 1551 / 7650, 269 ins, 257 del, 1025 sub ] exp/tri3b/decode_devtest.si/wer_14_1.0 +%WER 6.40 [ 480 / 7498, 50 ins, 58 del, 372 sub ] exp/tri3b/decode_native/wer_16_0.0 +%WER 10.91 [ 818 / 7498, 100 ins, 112 del, 606 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +%WER 14.30 [ 1318 / 9215, 206 ins, 134 del, 978 sub ] exp/tri3b/decode_nonnative/wer_17_0.0 +%WER 21.62 [ 1992 / 9215, 286 ins, 224 del, 1482 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +%WER 10.78 [ 1802 / 16713, 247 ins, 195 del, 1360 sub ] exp/tri3b/decode_test/wer_17_0.0 +%WER 16.81 [ 2809 / 16713, 374 ins, 338 del, 2097 sub ] exp/tri3b/decode_test.si/wer_16_1.0 + +# chain model results: +# for dir in $(echo exp/chain/tdnn1b_sp/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done +%WER 12.99 [ 994 / 7650, 192 ins, 163 del, 639 sub ] exp/chain/tdnn1b_sp/decode_devtest/wer_10_1.0 +%WER 12.47 [ 1149 / 9215, 119 ins, 174 del, 856 sub ] exp/chain/tdnn1b_sp/decode_nonnative/wer_12_0.0 +%WER 9.64 [ 1611 / 16713, 169 ins, 240 del, 1202 sub ] exp/chain/tdnn1b_sp/decode_test/wer_12_0.0 +%WER 6.13 [ 460 / 7498, 52 ins, 55 del, 353 sub ] exp/chain/tdnn1b_sp/decode_native/wer_10_0.0 diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh similarity index 91% rename from 
egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh rename to egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index ef4824bf7f2..1112f0ec08b 100755 --- a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh +++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -1,13 +1,11 @@ #!/bin/bash - -# run_cnn_tdnn_1a10.sh is modified from run_tdnn_1b.sh but taking +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1b.sh but taking # the xconfig from mini-librispeech's run_cnn_tdnn_1a54.sh; only # reducing the bottleneck-dim from 96 to 64, which is the value -# the run_tdnn1b.sh script here has. -# Better! -# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp exp/chain/cnn_tdnn1a10_sp -# System tdnn1a_sp tdnn1b_sp cnn_tdnn1a10_sp +# the run_tdnn1b.sh script here has. Results are better. +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1a_sp tdnn1b_sp cnn_tdnn1a_sp # %WER devtest 53.07 52.54 51.10 # %WER test 59.25 53.70 52.07 # %WER native 54.47 48.76 47.88 @@ -18,27 +16,6 @@ # Final valid prob (xent) -1.0719 -1.0849 -0.9915 # Num-params 6567648 3321312 3345088 - - -# 1b is as 1a but a re-tuned model with quite a few changes, including moving to -# a resnet-style factored TDNN-F model. -# -# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp -# System tdnn1a_sp tdnn1b_sp -# %WER devtest 53.07 52.54 -# %WER test 59.25 53.70 -# %WER native 54.47 48.76 -# %WER nonnative 63.01 57.66 -# Final train prob -0.0253 -0.0547 -# Final valid prob -0.0687 -0.0694 -# Final train prob (xent) -0.7715 -0.9502 -# Final valid prob (xent) -1.0719 -1.0849 -# Num-params 6567648 3321312 - - -# steps/info/chain_dir_info.pl exp/chain/tdnn1b_sp -# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1392 combine=-0.059->-0.059 (over 1) xent:train/valid[21,33,final]=(-1.28,-0.986,-0.950/-1.38,-1.10,-1.08) logprob:train/valid[21,33,final]=(-0.085,-0.063,-0.055/-0.090,-0.074,-0.069) - # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -53,7 +30,7 @@ nnet3_affix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. 
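# (Aside, not part of this patch: like the other tuning scripts in this recipe,
# this one is assumed to source ./utils/parse_options.sh after its variable
# definitions, so the "hardcoded" defaults below can still be overridden on the
# command line, e.g.
#   local/chain/tuning/run_cnn_tdnn_1a.sh --affix 1a --train-stage -10
# The available options are exactly the variables declared before that call.)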
-affix=1a10 # affix for the TDNN directory name +affix=1a # affix for the TDNN directory name tree_affix= train_stage=-10 get_egs_stage=-10 diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh index 4658f4d3d6d..6dde42bef79 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh @@ -1,19 +1,20 @@ #!/bin/bash # local/chain/compare_wer.sh exp/chain/tdnn1a_sp +# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp # System tdnn1a_sp -# %WER devtest 53.07 -# %WER test 59.25 -# %WER native 54.47 -# %WER nonnative 63.01 -# Final train prob -0.0253 -# Final valid prob -0.0687 -# Final train prob (xent) -0.7715 -# Final valid prob (xent) -1.0719 -# Num-params 6567648 +# %WER devtest 13.10 +# %WER test 15.53 +# %WER native 10.14 +# %WER nonnative 19.78 +# Final train prob -0.0233 +# Final valid prob -0.0720 +# Final train prob (xent) -0.8107 +# Final valid prob (xent) -0.9898 +# Num-params 6559440 # steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp/ -#exp/chain/tdnn1a_sp/: num-iters=105 nj=1..1 num-params=6.6M dim=40+100->1392 combine=-0.040->-0.033 (over 7) xent:train/valid[69,104,final]=(-1.12,-0.880,-0.771/-1.33,-1.21,-1.07) logprob:train/valid[69,104,final]=(-0.050,-0.031,-0.025/-0.079,-0.080,-0.069) +# exp/chain/tdnn1a_sp: num-iters=105 nj=1..1 num-params=6.6M dim=40+100->1384 combine=-0.032->-0.026 (over 7) xent:train/valid[69,104,final]=(-1.14,-0.892,-0.811/-1.19,-1.07,-0.990) logprob:train/valid[69,104,final]=(-0.045,-0.029,-0.023/-0.083,-0.080,-0.072) # Set -e here so that we catch if any executable fails immediately set -euo pipefail diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh index 33ce1556d29..d255d85327f 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh @@ -3,21 +3,20 @@ # 1b is as 1a but a re-tuned model with quite a few changes, including moving to # a resnet-style factored TDNN-F model. 
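# (Orientation, not part of this patch: in nnet3 xconfig notation a factored
# TDNN-F layer is written roughly as
#   tdnnf-layer name=tdnnf1 dim=768 bottleneck-dim=96 time-stride=1
# i.e. the weight matrix is factored through a low-rank bottleneck and the
# layer carries a resnet-style bypass connection; the dims shown here are
# placeholders, the real values for 1b are the ones set inside the script below.)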
# -# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp +# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp # System tdnn1a_sp tdnn1b_sp -# %WER devtest 53.07 52.54 -# %WER test 59.25 53.70 -# %WER native 54.47 48.76 -# %WER nonnative 63.01 57.66 -# Final train prob -0.0253 -0.0547 -# Final valid prob -0.0687 -0.0694 -# Final train prob (xent) -0.7715 -0.9502 -# Final valid prob (xent) -1.0719 -1.0849 -# Num-params 6567648 3321312 - +# %WER devtest 13.10 12.99 +# %WER test 15.53 9.64 +# %WER native 10.14 6.13 +# %WER nonnative 19.78 12.47 +# Final train prob -0.0233 -0.0442 +# Final valid prob -0.0720 -0.0726 +# Final train prob (xent) -0.8107 -0.9759 +# Final valid prob (xent) -0.9898 -0.9964 +# Num-params 6559440 3318224 # steps/info/chain_dir_info.pl exp/chain/tdnn1b_sp -# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1392 combine=-0.059->-0.059 (over 1) xent:train/valid[21,33,final]=(-1.28,-0.986,-0.950/-1.38,-1.10,-1.08) logprob:train/valid[21,33,final]=(-0.085,-0.063,-0.055/-0.090,-0.074,-0.069) +# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1384 combine=-0.044->-0.044 (over 1) xent:train/valid[21,33,final]=(-1.30,-0.993,-0.976/-1.28,-1.01,-0.996) logprob:train/valid[21,33,final]=(-0.071,-0.050,-0.044/-0.093,-0.076,-0.073) # Set -e here so that we catch if any executable fails immediately set -euo pipefail diff --git a/egs/heroico/s5/local/heroico_answers_make_lists.pl b/egs/heroico/s5/local/heroico_answers_make_lists.pl index fb3c0ecb8d1..c1a3735b4f1 100755 --- a/egs/heroico/s5/local/heroico_answers_make_lists.pl +++ b/egs/heroico/s5/local/heroico_answers_make_lists.pl @@ -30,7 +30,7 @@ my $t = "$tmpdir/answers/text"; # initialize hash for prompts -my %p = (); +my %prompts = (); # store prompts in hash LINEA: while ( my $line = <> ) { @@ -40,9 +40,27 @@ my @dirs = split /\//, $directories; # get the speaker number my $s = $dirs[-1]; + # pad the speaker number with zeroes + my $spk = ""; + if ( $s < 10 ) { + $spk = '000' . $s; + } elsif ( $s < 100 ) { + $spk = '00' . $s; + } elsif ( $s < 1000 ) { + $spk = '0' . $s; + } + # pad the filename with zeroes + my $fn = ""; + if ( $file < 10 ) { + $fn = '000' . $file; + } elsif ( $file < 100 ) { + $fn = '00' . $file; + } elsif ( $file < 1000 ) { + $fn = '0' . $file; + } # the utterance name - my $i = $s . '_' . 'a' . '_' . $file; - $p{$i} = $sent; + my $utt = $spk . '_' . $fn; + $prompts{$utt} = $sent; } open my $W, '<', $w or croak "problem with $w $!"; @@ -58,18 +76,36 @@ my @dirs = split /\//, $directories; my $r = basename $line, ".wav"; my $s = $dirs[-1]; - my $rid = $s . '_' . 'a' . '_' . $r; - if ( exists $p{$rid} ) { - print $T "$rid $p{$rid}\n"; - } elsif ( defined $rid ) { - warn "warning: problem\t$rid"; + my $spk = ""; + # pad with zeroes + if ( $s < 10 ) { + $spk = '000' . $s; + } elsif ( $s < 100 ) { + $spk = '00' . $s; + } elsif ( $s < 1000 ) { + $spk = '0' . $s; + } + # pad the file name with zeroes + my $rec = ""; + if ( $r < 10 ) { + $rec = '000' . $r; + } elsif ( $r < 100 ) { + $rec = '00' . $r; + } elsif ( $r < 1000 ) { + $rec = '0' . $r; + } + my $rec_id = $spk . '_' . 
$rec; + if ( exists $prompts{$rec_id} ) { + print $T "$rec_id $prompts{$rec_id}\n"; + } elsif ( defined $rec_id ) { + warn "warning: problem\t$rec_id"; next LINE; } else { croak "$line"; } - print $O "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $U "$rid ${s}_a\n"; + print $O "$rec_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $U "$rec_id $spk\n"; } close $T; close $O; diff --git a/egs/heroico/s5/local/heroico_download.sh b/egs/heroico/s5/local/heroico_download.sh new file mode 100755 index 00000000000..9c58fe37537 --- /dev/null +++ b/egs/heroico/s5/local/heroico_download.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Copyright 2018 John Morgan +# Apache 2.0. + +speech=$1 +lexicon=$2 + +download_dir=$(pwd) +tmpdir=data/local/tmp +data_dir=$tmpdir/LDC2006S37/data + +mkdir -p $tmpdir + +# download the corpus from openslr + +if [ ! -f $download_dir/heroico.tar.gz ]; then + wget -O $download_dir/heroico.tar.gz $speech + + ( + cd $download_dir + tar -xzf heroico.tar.gz + ) +fi + +mkdir -p data/local/dict $tmpdir/dict + +# download the dictionary from openslr + +if [ ! -f $download_dir/santiago.tar.gz ]; then + wget -O $download_dir/santiago.tar.gz $lexicon +fi + +( + cd $download_dir + tar -xzf santiago.tar.gz +) diff --git a/egs/heroico/s5/local/heroico_recordings_make_lists.pl b/egs/heroico/s5/local/heroico_recordings_make_lists.pl index 1d157665799..b9a3ab5a565 100755 --- a/egs/heroico/s5/local/heroico_recordings_make_lists.pl +++ b/egs/heroico/s5/local/heroico_recordings_make_lists.pl @@ -19,75 +19,102 @@ system "mkdir -p $tmpdir/recordings/devtest"; # input wav file list -my $w = "$tmpdir/wav_list.txt"; +my $input_wav_list = "$tmpdir/wav_list.txt"; # output temporary wav.scp files -my $o_train = "$tmpdir/recordings/train/wav.scp"; -my $o_test = "$tmpdir/recordings/devtest/wav.scp"; +my $train_wav_scp = "$tmpdir/recordings/train/wav.scp"; +my $test_wav_scp = "$tmpdir/recordings/devtest/wav.scp"; # output temporary utt2spk files -my $u_train = "$tmpdir/recordings/train/utt2spk"; -my $u_test = "$tmpdir/recordings/devtest/utt2spk"; +my $train_uttspk = "$tmpdir/recordings/train/utt2spk"; +my $test_uttspk = "$tmpdir/recordings/devtest/utt2spk"; # output temporary text files -my $t_train = "$tmpdir/recordings/train/text"; -my $t_test = "$tmpdir/recordings/devtest/text"; +my $train_text = "$tmpdir/recordings/train/text"; +my $test_text = "$tmpdir/recordings/devtest/text"; # initialize hash for prompts -my %p = (); +my %prompts = (); # store prompts in hash LINEA: while ( my $line = <> ) { chomp $line; - my ($s,$sent) = split /\t/, $line, 2; - $p{$s} = $sent; + my ($prompt_id,$prompt) = split /\t/, $line, 2; + # pad the prompt id with zeroes + my $pid = ""; + if ( $prompt_id < 10 ) { + $pid = '0000' . $prompt_id; + } elsif ( $prompt_id < 100 ) { + $pid = '000' . $prompt_id; + } elsif ( $prompt_id < 1000 ) { + $pid = '00' . 
$prompt_id; + } + $prompts{$pid} = $prompt; } -open my $W, '<', $w or croak "problem with $w $!"; -open my $OT, '+>', $o_train or croak "problem with $o_train $!"; -open my $OE, '+>', $o_test or croak "problem with $o_test $!"; -open my $UT, '+>', $u_train or croak "problem with $u_train $!"; -open my $UE, '+>', $u_test or croak "problem with $u_test $!"; -open my $TT, '+>', $t_train or croak "problem with $t_train $!"; -open my $TE, '+>', $t_test or croak "problem with $t_test $!"; +open my $WVL, '<', $input_wav_list or croak "problem with $input_wav_list $!"; +open my $TRNWSCP, '+>', $train_wav_scp or croak "problem with $train_wav_scp $!"; +open my $TSTWSCP, '+>', $test_wav_scp or croak "problem with $test_wav_scp $!"; +open my $TRNUTTSPK, '+>', $train_uttspk or croak "problem with $train_uttspk $!"; +open my $TSTUTTSPK, '+>', $test_uttspk or croak "problem with $test_uttspk $!"; +open my $TRNTXT, '+>', $train_text or croak "problem with $train_text $!"; +open my $TSTTXT, '+>', $test_text or croak "problem with $test_text $!"; - LINE: while ( my $line = <$W> ) { + LINE: while ( my $line = <$WVL> ) { chomp $line; next LINE if ($line =~ /Answers/ ); next LINE unless ( $line =~ /Recordings/ ); my ($volume,$directories,$file) = File::Spec->splitpath( $line ); my @dirs = split /\//, $directories; - my $r = basename $line, ".wav"; - my $s = $dirs[-1]; - my $rid = $s . '_r' . '_' . $r; - if ( ( $r >= 355 ) and ( $r < 561 ) ) { - if ( exists $p{$r} ) { - print $TE "$rid $p{$r}\n"; - } elsif ( defined $rid ) { - warn "problem\t$rid"; + my $utt_id = basename $line, ".wav"; + # pad the utterance id with zeroes + my $utt = ""; + if ( $utt_id < 10 ) { + $utt = '0000' . $utt_id; +} elsif ( $utt_id < 100 ) { + $utt = '000' . $utt_id; +} elsif ( $utt_id < 1000 ) { + $utt = '00' . $utt_id; +} + my $spk_id = $dirs[-1]; + # pad the speaker id with zeroes + my $spk = ""; + if ( $spk_id < 10 ) { + $spk = '000' . $spk_id; + } elsif ( $spk_id < 100 ) { + $spk = '00' . $spk_id; + } elsif ( $spk_id < 1000 ) { + $spk = '0' . $spk_id; + } + my $spk_utt_id = $spk . '_' . 
$utt; + if ( ( $utt_id >= 355 ) and ( $utt_id < 561 ) ) { +if ( exists $prompts{$utt} ) { + print $TSTTXT "$spk_utt_id $prompts{$utt}\n"; + } elsif ( defined $spk_utt_id ) { + warn "problem\t$spk_utt_id"; next LINE; } else { croak "$line"; } - print $OE "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $UE "$rid ${s}_r\n"; - } elsif ( ( $r < 355 ) or ( $r > 560 ) ) { - if ( exists $p{$r} ) { - print $TT "$rid $p{$r}\n"; - } elsif ( defined $rid ) { - warn "problem\t$rid"; + print $TSTWSCP "$spk_utt_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $TSTUTTSPK "$spk_utt_id $spk\n"; + } elsif ( ( $utt_id < 355 ) or ( $utt_id > 560 ) ) { + if ( exists $prompts{$utt} ) { + print $TRNTXT "$spk_utt_id $prompts{$utt}\n"; + } elsif ( defined $spk_utt_id ) { + warn "problem\t$spk_utt_id"; next LINE; } else { croak "$line"; } - print $OT "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $UT "$rid ${s}_r\n"; - } + print $TRNWSCP "$spk_utt_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $TRNUTTSPK "$spk_utt_id $spk\n"; + } } -close $TT; -close $OT; -close $UT; -close $TE; -close $OE; -close $UE; -close $W; +close $TRNTXT; +close $TRNWSCP; +close $TRNUTTSPK; +close $TSTTXT; +close $TSTWSCP; +close $TSTUTTSPK; +close $WVL; diff --git a/egs/heroico/s5/local/nnet3/run_ivector_common.sh b/egs/heroico/s5/local/nnet3/run_ivector_common.sh index 153f0073667..e882ce0c918 100755 --- a/egs/heroico/s5/local/nnet3/run_ivector_common.sh +++ b/egs/heroico/s5/local/nnet3/run_ivector_common.sh @@ -9,6 +9,9 @@ set -euo pipefail # of usage. stage=0 +nj=56 +num_threads_ubm=2 + train_set=train test_sets="native nonnative devtest test" gmm=tri3b @@ -37,25 +40,17 @@ if [ $stage -le 1 ]; then utils/data/perturb_data_dir_speed_3way.sh \ data/${train_set} \ data/${train_set}_sp - echo "$0: making MFCC features for low-resolution speed-perturbed data" - steps/make_mfcc.sh \ - --cmd "$train_cmd" \ - --nj 10 \ - data/${train_set}_sp || exit 1; - steps/compute_cmvn_stats.sh \ - data/${train_set}_sp || exit 1; - utils/fix_data_dir.sh \ - data/${train_set}_sp + + echo "$0: making mfcc features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp fi if [ $stage -le 2 ]; then echo "$0: aligning with the perturbed low-resolution data" steps/align_fmllr.sh \ - --nj 20 \ - --cmd "$train_cmd" \ - data/${train_set}_sp \ - data/lang \ - $gmm_dir \ + --nj 20 --cmd "$train_cmd" data/${train_set}_sp data/lang $gmm_dir \ $ali_dir || exit 1 fi diff --git a/egs/heroico/s5/local/prepare_data.sh b/egs/heroico/s5/local/prepare_data.sh index db2b990c07b..b78d9f1d1cb 100755 --- a/egs/heroico/s5/local/prepare_data.sh +++ b/egs/heroico/s5/local/prepare_data.sh @@ -4,17 +4,17 @@ # Apache 2.0. . ./cmd.sh - . ./path.sh stage=0 +datadir=$1 . 
./utils/parse_options.sh set -e set -o pipefail -# the location of the LDC corpus -datadir=$1 +tmpdir=data/local/tmp + # acoustic models are trained on the heroico corpus # testing is done on the usma corpus # heroico consists of 2 parts: answers and recordings (recited) @@ -25,8 +25,6 @@ recordings_transcripts=$datadir/data/transcripts/heroico-recordings.txt # usma is all recited usma_transcripts=$datadir/data/transcripts/usma-prompts.txt -tmpdir=data/local/tmp - # make acoustic model training lists if [ $stage -le 0 ]; then mkdir -p $tmpdir/heroico $tmpdir/usma @@ -37,12 +35,12 @@ if [ $stage -le 0 ]; then # the transcripts are converted to UTF8 export LC_ALL=en_US.UTF-8 cat $answers_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/heroico_answers_make_lists.pl + tr -d '\r' | local/heroico_answers_make_lists.pl utils/fix_data_dir.sh $tmpdir/heroico/answers cat $recordings_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/heroico_recordings_make_lists.pl + tr -d '\r' | local/heroico_recordings_make_lists.pl utils/fix_data_dir.sh $tmpdir/heroico/recordings/train utils/fix_data_dir.sh $tmpdir/heroico/recordings/devtest @@ -52,11 +50,11 @@ if [ $stage -le 0 ]; then for x in wav.scp utt2spk text; do cat $tmpdir/heroico/answers/$x $tmpdir/heroico/recordings/train/$x | \ - sed -e 's/\r//' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x + tr -d '\r' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x done for x in wav.scp utt2spk text; do - cat $tmpdir/heroico/recordings/devtest/$x | sed -e 's/\r//' | \ + cat $tmpdir/heroico/recordings/devtest/$x | tr -d '\r' | \ sort -k1,1 -u >$tmpdir/heroico/lists/devtest/$x done @@ -67,10 +65,10 @@ fi if [ $stage -le 1 ]; then # make separate lists for usma (US military academy) native and nonnative cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/usma_native_make_lists.pl + tr -d '\r' | dos2unix | local/usma_native_make_lists.pl cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/usma_nonnative_make_lists.pl + tr -d '\r' | local/usma_nonnative_make_lists.pl for n in native nonnative; do mkdir -p $tmpdir/usma/$n/lists @@ -86,14 +84,14 @@ if [ $stage -le 1 ]; then # get training lists for x in wav.scp utt2spk text; do cat $tmpdir/heroico/answers/${x} $tmpdir/heroico/recordings/train/${x} | \ - sed -e 's/\r//' >$tmpdir/lists/train/$x + tr -d '\r' >$tmpdir/lists/train/$x sort $tmpdir/lists/train/$x >data/train/$x done # get devtest lists for x in wav.scp utt2spk text; do cat $tmpdir/heroico/lists/devtest/$x | \ - sed -e 's/\r//' >$tmpdir/lists/devtest/$x + tr -d '\r' >$tmpdir/lists/devtest/$x sort $tmpdir/lists/devtest/$x >data/devtest/$x done diff --git a/egs/heroico/s5/local/prepare_dict.sh b/egs/heroico/s5/local/prepare_dict.sh index a6d182a6852..9f498bc963a 100755 --- a/egs/heroico/s5/local/prepare_dict.sh +++ b/egs/heroico/s5/local/prepare_dict.sh @@ -13,12 +13,12 @@ fi export LC_ALL=C -cut -f2- data/local/tmp/dict/santiago.txt | \ +cut -f2- ./santiago.txt | \ tr -s '[:space:]' '[\n*]' | \ grep -v SPN | sort -u >data/local/dict/nonsilence_phones.txt # sed "1d" deletes the last line. 
-expand -t 1 data/local/tmp/dict/santiago.txt | sort -u | +expand -t 1 ./santiago.txt | sort -u | sed "1d" >data/local/dict/lexicon.txt echo " SPN" >> data/local/dict/lexicon.txt diff --git a/egs/heroico/s5/local/subs_download.sh b/egs/heroico/s5/local/subs_download.sh new file mode 100755 index 00000000000..98dcb42d4e0 --- /dev/null +++ b/egs/heroico/s5/local/subs_download.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright 2017 John Morgan +# Apache 2.0. + +tmpdir=data/local/tmp +download_dir=$(pwd) +mkdir -p $download_dir +subs_src=$1 + +# download the subs corpus +if [ ! -f $download_dir/subs.zip ]; then + wget -O $download_dir/subs.zip $subs_src + ( + cd $download_dir + unzip subs.zip + ) + else + echo "$0: subs file already downloaded." +fi diff --git a/egs/heroico/s5/local/subs_prepare_data.pl b/egs/heroico/s5/local/subs_prepare_data.pl index 3cd906d4699..a7e0cfb0c6e 100755 --- a/egs/heroico/s5/local/subs_prepare_data.pl +++ b/egs/heroico/s5/local/subs_prepare_data.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # Copyright 2017 John Morgan # Apache 2.0. @@ -12,69 +12,64 @@ use Encode; # set lower and upper bounds -my $lb = 8; -# only segments with at least $lb words will be written -my $ub = 16; -# only segments with fewer than $ub words will be written +my $low_bound = 8; +# only segments with at least $low_bound words will be written +my $up_bound = 16; +# only segments with fewer than $up_bound words will be written # input and output files -my $c = "data/local/tmp/subs/OpenSubtitles2016.en-es.es"; -my $symtab = "data/lang/words.txt"; -my $rl = "data/local/tmp/subs/lm/es.txt"; -my $oo = "data/local/tmp/subs/lm/oovs.txt"; + +my $corpus = "OpenSubtitles2018.en-es.es"; +my $symbol_table = "data/lang/words.txt"; +my $filtered = "data/local/tmp/subs/lm/es.txt"; +my $oovs = "data/local/tmp/subs/lm/oovs.txt"; my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt"; -open my $C, '<', $c or croak "problems with $c $!"; +open my $C, '<', $corpus or croak "problems with $corpus $!"; system "mkdir -p data/local/tmp/subs/lm"; -open my $RL, '+>:utf8', $rl or croak "problems with $rl $!"; - -LINE: while ( my $line = <$C> ) { - $line = decode_utf8 $line; - chomp $line; - - my @tokens = split /\s+/, $line; - - next LINE if ( ($#tokens < $lb) or ($#tokens > $ub )); - - #remove control characters - #$line =~ s/(\p{Other})/ /g; - #$line =~ s/(\p{Control})/ /g; - #$line =~ s/(\p{Format})/ /g; - #$line =~ s/(\p{Private_Use})/ /g; - #$line =~ s/(\p{Surrogate})/ /g; - - # punctuation - $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; -#convert tabs to white space - $line =~ s/\t/ /g; - #hard to soft space - $line =~ s/ / /g; -#squeeze white space - $line =~ s/\s+/ /g; -#initial and final white space - $line =~ s/^\p{Separator}+//; - $line =~ s/\p{Separator}+$//; -#down case - $line = lc $line; - - - print $RL "$line\n"; - +if ( -e $filtered ) { + warn "$filtered already exists."; +} else { + open my $FLT, '+>:utf8', $filtered or croak "problems with $filtered $!"; + LINE: while ( my $line = <$C> ) { + $line = decode_utf8 $line; + chomp $line; + + my @tokens = split /\s+/, $line; + + next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound )); + + # remove punctuation + $line =~ 
s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; + #convert tabs to white space + $line =~ s/\t/ /g; + #hard to soft space + $line =~ s/ / /g; + #squeeze white space + $line =~ s/\s+/ /g; + #initial and final white space + $line =~ s/^\p{Separator}+//; + $line =~ s/\p{Separator}+$//; + #down case + $line = lc $line; + + print $FLT "$line\n"; + } + close $FLT; } - close $C; -close $RL; + # find out of vocabulary words -# $symtab points to a file containing a map of symbols to integers +# $symbol_table points to a file containing a map of symbols to integers # hash for word to integer map my %sym2int = (); -open my $F, '<', $symtab or croak "problem with $symtab $!"; +open my $F, '<', $symbol_table or croak "problem with $symbol_table $!"; # store words to int map in hash while( my $line = <$F>) { @@ -84,33 +79,33 @@ } close $F; -open my $I, '<', $rl or croak "problem with $rl $!"; -open my $OO, '+>', $oo or croak "problems with $oo $!"; +open my $I, '<', $filtered or croak "problem with $filtered $!"; +open my $OOVS, '+>', $oovs or croak "problems with $oovs $!"; while ( my $line = <$I>) { chomp $line; my @A = split /\s/, $line; foreach my $a (@A) { if (!defined ($sym2int{$a})) { - print $OO "$a\n"; + print $OOVS "$a\n"; } } } -close $OO; +close $OOVS; close $I; # remove segments with OOVs # store OOVS in hash my %oov = (); -open my $V, '<', $oo or croak "problems with $oo $!"; +open my $V, '<', $oovs or croak "problems with $oovs $!"; while ( my $line = <$V> ) { chomp $line; $oov{$line} = 1; } close $V; -open my $L, '<', $rl or croak "problems with $rl $!"; +open my $L, '<', $filtered or croak "problems with $filtered $!"; open my $IV, '+>', $iv or croak "problems with $iv $!"; SEGMENT: while ( my $segment = <$L> ) { diff --git a/egs/heroico/s5/run.sh b/egs/heroico/s5/run.sh index 711bece3c66..67ad87e55f9 100755 --- a/egs/heroico/s5/run.sh +++ b/egs/heroico/s5/run.sh @@ -1,83 +1,72 @@ #!/bin/bash . ./cmd.sh - . ./path.sh + stage=0 +# the location of the LDC corpus; this location works for the CLSP grid. +datadir=/export/corpora5/LDC/LDC2006S37 + +# The corpus and lexicon are on openslr.org +speech="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" +lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" + +# Location of the Movie subtitles text corpus +subs_src="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" + . utils/parse_options.sh set -e set -o pipefail set -u -# the location of the LDC corpus; this location works for the CLSP grid. -datadir=/export/corpora5/LDC/LDC2006S37 - -#datadir=/mnt/corpora/LDC2006S37 -# location of subtitles text data -# note: this is not used so I'm commenting it out; dan. -#subsdata="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2016/en-es.txt.zip" -lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" # don't change tmpdir, the location is used explicitly in scripts in local/. tmpdir=data/local/tmp if [ $stage -le 0 ]; then - # prepare the lists for acoustic model training and testing - mkdir -p $tmpdir/heroico - mkdir -p $tmpdir/usma - - [ ! 
-d "$datadir" ] && \ - echo "$0 Data directory (LDC corpus release) does not exist" && \ - exit 1 - local/prepare_data.sh $datadir + # download the corpus from openslr + local/heroico_download.sh $speech $lexicon + # Get data for lm training + local/subs_download.sh $subs_src fi if [ $stage -le 1 ]; then - # prepare a dictionary - mkdir -p data/local/dict - mkdir -p data/local/tmp/dict - - # download the dictionary from openslr - if [ ! -f data/local/tmp/dict/santiago.tar.gz ]; then - wget -O data/local/tmp/dict/santiago.tar.gz $lexicon - fi - - ( - cd $tmpdir/dict - tar -xzf santiago.tar.gz - ) + echo "Makin lists for building models." + local/prepare_data.sh $datadir +fi +if [ $stage -le 2 ]; then + mkdir -p data/local/dict $tmpdir/dict local/prepare_dict.sh +fi - # prepare the lang directory +if [ $stage -le 3 ]; then utils/prepare_lang.sh \ data/local/dict "" \ data/local/lang data/lang fi -if [ $stage -le 2 ]; then - # use am training text to train lm - mkdir -p $tmpdir/heroico/lm +if [ $stage -le 4 ]; then + mkdir -p $tmpdir/subs/lm + local/subs_prepare_data.pl +fi + +if [ $stage -le 5 ]; then echo "point 1" - # get the text from data/train/text - cut -d " " -f 2- data/train/text > $tmpdir/heroico/lm/train.txt - echo "point 2" - # build lm - local/prepare_lm.sh $tmpdir/heroico/lm/train.txt + local/prepare_lm.sh $tmpdir/subs/lm/in_vocabulary.txt +fi - echo "point 3" +if [ $stage -le 6 ]; then + echo "point 2" utils/format_lm.sh \ data/lang data/local/lm/trigram.arpa.gz data/local/dict/lexicon.txt \ data/lang_test - - # delete temporary work - rm -rf data/local/tmp fi -if [ $stage -le 3 ]; then - # extract acoustic features +if [ $stage -le 7 ]; then + echo "$0: extracting acoustic features." mkdir -p exp for fld in native nonnative test devtest train; do @@ -92,7 +81,7 @@ if [ $stage -le 3 ]; then done fi -if [ $stage -le 4 ]; then +if [ $stage -le 8 ]; then echo "$0 monophone training" steps/train_mono.sh --nj 8 --cmd "$train_cmd" data/train data/lang exp/mono || exit 1; @@ -108,8 +97,7 @@ if [ $stage -le 4 ]; then ) & fi -if [ $stage -le 5 ]; then - +if [ $stage -le 9 ]; then # align with monophones steps/align_si.sh --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/mono exp/mono_ali @@ -131,10 +119,8 @@ if [ $stage -le 5 ]; then fi -if [ $stage -le 6 ]; then +if [ $stage -le 10 ]; then echo "$0: Starting delta system alignment" - - # align with triphones steps/align_si.sh \ --nj 8 --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali @@ -156,10 +142,9 @@ if [ $stage -le 6 ]; then ) & fi -if [ $stage -le 7 ]; then +if [ $stage -le 11 ]; then echo "$0: Starting LDA+MLLT system alignment" - # align with lda and mllt adapted triphones steps/align_si.sh \ --use-graphs true --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/tri2b exp/tri2b_ali @@ -169,7 +154,6 @@ if [ $stage -le 7 ]; then --cmd "$train_cmd" \ 3100 50000 data/train data/lang exp/tri2b_ali exp/tri3b - # align with tri3b models echo "$0 Starting exp/tri3b_ali" steps/align_fmllr.sh \ --nj 8 --cmd "$train_cmd" \ @@ -182,16 +166,16 @@ if [ $stage -le 7 ]; then utils/mkgraph.sh \ data/lang_test exp/tri3b exp/tri3b/graph || exit 1; - # decode test sets with tri3b models for x in native nonnative devtest test; do + echo "$0: decoding $x with tri3b models." steps/decode_fmllr.sh \ --nj 8 --cmd "$decode_cmd" exp/tri3b/graph data/$x exp/tri3b/decode_${x} done ) & fi -if [ $stage -le 9 ]; then - # train and test chain models +if [ $stage -le 12 ]; then + echo "$0: train and test chain models." 
local/chain/run_tdnn.sh fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 294e41cbc85..6df93e739f4 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -14,7 +14,7 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # wellington_database points to the database path on the JHU grid. The Wellington # corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). # This corpus is of written NZ English that can be purchased here: -# "https://www.victoria.ac.nz/lals/resources/corpora-default" +# "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. @@ -90,7 +90,8 @@ fi if [ $stage -le 5 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/iam/v2/cmd.sh b/egs/iam/v2/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/iam/v2/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/iam/v2/image b/egs/iam/v2/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/iam/v2/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh new file mode 100755 index 00000000000..d4076457463 --- /dev/null +++ b/egs/iam/v2/local/chain/compare_wer.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. 
./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/iam/v2/local/chain/run_cnn_e2eali.sh b/egs/iam/v2/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..ad51803ab0e --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1c.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..15bdf610cd3 --- /dev/null +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a +# System cnn_1a cnn_chainali_1c e2e_cnn_1a +# WER 18.52 12.72 12.15 +# CER 10.07 5.99 6.03 +# Final train prob -0.0077 -0.0291 -0.0371 +# Final valid prob -0.0970 -0.0359 -0.0636 +# Final train prob (xent) -0.5484 -0.9781 +# Final valid prob (xent) -0.9643 -1.1544 +# Parameters 4.36M 3.96M 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + 
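# (Reading the xconfig above, not part of the patch: the 40-dim input is
# treated as a 40-pixel-high image; cnn2 and cnn5 set height-subsample-out=2,
# so the feature height shrinks 40 -> 20 -> 10 before the TDNN layers, while
# subsampling in time (factor 4) is applied separately via the
# --chain.frame-subsampling-factor option passed to training below.)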
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..ba28f681708 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# e2eali_1a is the same as chainali_1c but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_chainali_1c exp/chain/cnn_e2eali_1a +# System e2e_cnn_1a cnn_chainali_1c cnn_e2eali_1a +# WER 13.87 12.72 12.70 +# CER 6.54 5.99 5.75 +# Final train prob -0.0371 -0.0291 -0.0557 +# Final valid prob -0.0636 -0.0359 -0.0770 +# Final train prob (xent) -0.9781 -0.8847 +# Final valid prob (xent) -1.1544 -1.0370 +# Parameters 9.13M 3.96M 3.95M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.056->-0.056 (over 1) xent:train/valid[13,20,final]=(-1.47,-0.978,-0.918/-1.54,-1.10,-1.06) logprob:train/valid[13,20,final]=(-0.106,-0.065,-0.056/-0.113,-0.086,-0.079) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. 
"1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + 
output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
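  # (Worked example for the xent branch defined above, not part of the patch:
  # with xent_regularize=0.1 as set at the top of this script,
  # learning_rate_factor = 0.5 / 0.1 = 5.0, so output-xent learns 5x faster,
  # offsetting the 0.1 scaling applied to the cross-entropy objective.)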
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..298e7053086 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,251 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but uses unconstrained egs + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1a cnn_e2eali_1b +# WER 10.40 10.33 +# WER (rescored) 10.02 10.10 +# CER 4.97 5.00 +# CER (rescored) 4.83 4.88 +# Final train prob -0.0612 -0.0428 +# Final valid prob -0.0857 -0.0666 +# Final train prob (xent) -0.8990 -0.9210 +# Final valid prob (xent) -1.0024 -1.0264 +# Parameters 3.98M 3.98M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +# exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..ef851c8ae2f --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. 
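#
# (Editor's note, an illustration rather than part of the patch: to see exactly
# what changed relative to 1b, a plain diff of the two tuning scripts added in
# this PR is enough, e.g.
#   diff local/chain/tuning/run_cnn_e2eali_1b.sh local/chain/tuning/run_cnn_e2eali_1c.sh
# which should show the dropped cnn6/cnn7 layers, the smaller l2-regularize
# values, the larger tdnn_dim, the extra epochs and the added dropout schedule.)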
+ + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1b cnn_e2eali_1c +# WER 10.33 10.05 +# WER (rescored) 10.10 9.75 +# CER 5.00 4.76 +# CER (rescored) 4.88 4.68 +# Final train prob -0.0428 -0.0317 +# Final valid prob -0.0666 -0.0630 +# Final train prob (xent) -0.9210 -0.5413 +# Final valid prob (xent) -1.0264 -0.7096 +# Parameters 3.98M 5.12M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=21 nj=2..4 num-params=5.1M dim=40->392 combine=-0.034->-0.034 (over 1) xent:train/valid[13,20,final]=(-0.953,-0.800,-0.541/-1.03,-0.933,-0.710) logprob:train/valid[13,20,final]=(-0.069,-0.048,-0.032/-0.091,-0.078,-0.063) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b6 #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi diff --git a/egs/iam/v2/local/check_tools.sh b/egs/iam/v2/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/iam/v2/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. 
+# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/iam/v2/local/make_features.py b/egs/iam/v2/local/make_features.py new file mode 100755 index 00000000000..84e012daedb --- /dev/null +++ b/egs/iam/v2/local/make_features.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. + + eg. local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('dir', type=str, + help='Source data directory (containing images.scp)') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') + + +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im, allowed_lengths = None): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was 
found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = padding // 2 + right_padding = padding - left_padding + dim_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + +### main ### +data_list_path = os.path.join(args.dir, 'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +allowed_lengths = None +if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scaled = get_scaled_image(im, allowed_lengths) + + if im_scaled is None: + num_fail += 1 + continue + data = np.transpose(im_scaled, (1, 0)) + data = np.divide(data, 255.0) + num_ok += 1 + write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (iamge too ' + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh new file mode 100755 index 00000000000..73d711c73f0 --- /dev/null +++ b/egs/iam/v2/local/prepare_data.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the IAM handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +stage=0 +download_dir=data/download +wellington_dir= +username= +password= # username and password for downloading the IAM database + # if you have not already downloaded the database, please + # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database + # and provide this script with your username and password. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then + echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files" + echo "exist in your data/local directory this script will fail because the required files" + echo "can't be downloaded automatically (it needs registration)." + echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" + echo "... 
and then call this script again with --username --password " + echo "" + exit 1 +fi + +lines=data/local/lines +xml=data/local/xml +ascii=data/local/ascii +bcorpus=data/local/browncorpus +lobcorpus=data/local/lobcorpus +wcorpus=data/local/wellingtoncorpus +data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask +lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz +xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz +data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip +ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz +brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt +lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +mkdir -p $download_dir data/local + +# download and extact images and transcription +if [ -d $lines ]; then + echo "$0: Not downloading lines images as it is already there." +else + if [ ! -f $download_dir/lines.tgz ]; then + echo "$0: Trying to download lines images..." + wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1; + fi + mkdir -p $lines + tar -xzf $download_dir/lines.tgz -C $lines || exit 1; + echo "$0: Done downloading and extracting lines images" +fi + +if [ -d $xml ]; then + echo "$0: Not downloading transcriptions as it is already there." +else + if [ ! -f $download_dir/xml.tgz ]; then + echo "$0: Trying to download transcriptions..." + wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1; + fi + mkdir -p $xml + tar -xzf $download_dir/xml.tgz -C $xml || exit 1; + echo "$0: Done downloading and extracting transcriptions." +fi + +if [ -d $data_split_info ]; then + echo "$0: Not downloading data split information as it is already there." +else + if [ ! -f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo "$0: Trying to download training and testing data split information..." + wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo "$0: Done downloading and extracting training and testing data split information" +fi + +if [ -d $ascii ]; then + echo "$0: Not downloading ascii.tgz as it is already there." +else + if [ ! -f $download_dir/ascii.tgz ]; then + echo "$0: trying to download ascii.tgz..." + wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1; + echo "$0: Done downloading and extracting ascii.tgz" +fi + +if [ -d $lobcorpus ]; then + echo "$0: Not downloading the LOB text corpus as it is already there." +else + if [ ! -f $lobcorpus/0167.zip ]; then + echo "$0: Downloading the LOB text corpus ..." + mkdir -p $lobcorpus + wget -P $lobcorpus/ $lob_corpus_url || exit 1; + fi + unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1; + echo "$0: Done downloading and extracting LOB corpus" +fi + +if [ -d $bcorpus ]; then + echo "$0: Not downloading the Brown corpus as it is already there." +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo "$0: Downloading the Brown text corpus..." 
+ wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo "$0: Done downloading the Brown text corpus" +fi + +if [ -d $wcorpus ]; then + echo "$0: Not copying Wellington corpus as it is already there." +elif [ ! -z $wellington_dir ]; then + mkdir -p $wcorpus + cp -r $wellington_dir/. $wcorpus + + # Combine Wellington corpora and replace some of their annotations + cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \ + cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt + + cat data/local/wellingtoncorpus/Wellington_annotated.txt | local/remove_wellington_annotations.py > data/local/wellingtoncorpus/Wellington_annotation_removed.txt + + echo "$0: Done copying Wellington corpus" +else + echo "$0: Wellington Corpus not included because wellington_dir not provided" +fi + +mkdir -p data/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask + +train_old="data/local/$file_name/trainset.txt" +test_old="data/local/$file_name/testset.txt" +val1_old="data/local/$file_name/validationset1.txt" +val2_old="data/local/$file_name/validationset2.txt" + +train_new="data/local/train.uttlist" +test_new="data/local/test.uttlist" +val_new="data/local/validation.uttlist" + +cat $train_old > $train_new +cat $test_old > $test_new +cat $val1_old $val2_old > $val_new + +if [ $stage -le 0 ]; then + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 + + utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi diff --git a/egs/iam/v2/local/prepare_dict.sh b/egs/iam/v2/local/prepare_dict.sh new file mode 100755 index 00000000000..e21a59c7e92 --- /dev/null +++ b/egs/iam/v2/local/prepare_dict.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +vocab_size=50000 +. ./utils/parse_options.sh + +mkdir -p $dir + +# First get the set of all letters that occur in data/train/text +cat data/train/text | \ + perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ + sort -u | grep -v "|" > $dir/nonsilence_phones.txt + +# Now use the pocolm's wordlist which is the most N frequent words in +# in data/train/text and LOB+Brown corpora (dev and test excluded) with their comprising +# letters as their transcription. Only include words that use the above letters. +# (Letter # is replaced with ) + +export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") + +head -n $vocab_size data/local/local_lm/data/word_count | awk '{print $2}' | \ + perl -e '$letters=$ENV{letters}; $letters=$letters . 
"|"; +while(<>){ + chop; + $w = $_; + if($w =~ m/^[$letters]+$/){ + $trans = join(" ", split(//, $w)); + $trans =~ s/#//g; + $trans =~ s/\|/SIL/g; + print "$w $trans\n"; + } +}' | sort -u > $dir/lexicon.txt + + +sed -i "s/#//" $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/iam/v2/local/prepend_words.py b/egs/iam/v2/local/prepend_words.py new file mode 100755 index 00000000000..d53eb8974bf --- /dev/null +++ b/egs/iam/v2/local/prepend_words.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This script, prepend '|' to every words in the transcript to mark +# the beginning of the words for finding the initial-space of every word +# after decoding. + +import sys, io + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/iam/v2/local/process_data.py b/egs/iam/v2/local/process_data.py new file mode 100755 index 00000000000..fa5eb484707 --- /dev/null +++ b/egs/iam/v2/local/process_data.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('out_dir', type=str, + help='Where to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.database_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + + form_elements = doc.getElementsByTagName('form')[0] + writer_id = 
form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py new file mode 100755 index 00000000000..1b414ef47f6 --- /dev/null +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora + +import argparse +import os +import numpy as np +import sys +import re + +parser = argparse.ArgumentParser(description="""Removes dev/test set lines + from the LOB corpus. Reads the + corpus from stdin, and writes it to stdout.""") +parser.add_argument('dev_text', type=str, + help='dev transcription location.') +parser.add_argument('test_text', type=str, + help='test transcription location.') +args = parser.parse_args() + +def remove_punctuations(transcript): + char_list = [] + for char in transcript: + if char.isdigit() or char == '+' or char == '~' or char == '?': + continue + if char == '#' or char == '=' or char == '-' or char == '!': + continue + if char == ',' or char == '.' or char == ')' or char == '\'': + continue + if char == '(' or char == ':' or char == ';' or char == '"': + continue + char_list.append(char) + return char_list + + +def remove_special_words(words): + word_list = [] + for word in words: + if word == '' or word == '#': + continue + word_list.append(word) + return word_list + + +# process and add dev/eval transcript in a list +# remove special words, punctuations, spaces between words +# lowercase the characters +def read_utterances(text_file_path): + with open(text_file_path, 'rt') as in_file: + for line in in_file: + words = line.strip().split() + words_wo_sw = remove_special_words(words) + transcript = ''.join(words_wo_sw[1:]) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + utterance_dict[words_wo_sw[0]] = transcript + + +### main ### + +# read utterances and add it to utterance_dict +utterance_dict = dict() +read_utterances(args.dev_text) +read_utterances(args.test_text) + +# read corpus and add it to below lists +corpus_text_lowercase_wo_sc = list() +corpus_text_wo_sc = list() +original_corpus_text = list() +for line in sys.stdin: + original_corpus_text.append(line) + words = line.strip().split() + words_wo_sw = remove_special_words(words) + + transcript = ''.join(words_wo_sw) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_lowercase_wo_sc.append(transcript) + + transcript = ''.join(words_wo_sw) + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_wo_sc.append(transcript) + +# find majority of utterances below +# for utterances which were not found +# add them to remaining_utterances +row_to_keep = [True for i in range(len(original_corpus_text))] +remaining_utterances = dict() +for line_id, line_to_find in utterance_dict.items(): + found_line = False + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines 
of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False + if not found_line: + remaining_utterances[line_id] = line_to_find + + +for i in range(len(original_corpus_text)): + transcript = original_corpus_text[i].strip() + if row_to_keep[i]: + print(transcript) + +print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr) +print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr) +print('Number of sentences not removed from LOB: {}'. format(len(remaining_utterances)), file=sys.stderr) +print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text), + row_to_keep.count(True)), file=sys.stderr) diff --git a/egs/iam/v2/local/remove_wellington_annotations.py b/egs/iam/v2/local/remove_wellington_annotations.py new file mode 100755 index 00000000000..260a3542985 --- /dev/null +++ b/egs/iam/v2/local/remove_wellington_annotations.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +import sys +import io +import re +from collections import OrderedDict + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); + +prev2_line = " "; +prev_line = " "; +for line in sys.stdin: + line = line.strip() + pattern = re.compile("\\*\\*\\[.*?\\*\\*\\]|\\*[0-9]|\\\\[0-9]{0,2}|\\*\\*?[\|,\?,\#,\=,\;,\:,\<,\>]|\||\^") + line_fixed = pattern.sub("", line) + dict=OrderedDict([("*+$","$"), ("*+","£"), ("*-","-"), ("*/","*"), ("*{","{"), ("*}","}"), + ("**\"","\""), ("*\"","\""), ("**'","'"), ("*'","'"), ("*@","°")]) + pattern = re.compile("|".join(re.escape(key) for key in dict.keys())); + line_fixed = pattern.sub(lambda x: dict[x.group()], line_fixed) + + line_fixed = prev2_line + "\n" + prev_line + "\n" + line_fixed + + pattern = re.compile("\{[0-9]{0,2}(.*?)\}", re.DOTALL) + line_fixed = pattern.sub(lambda x: x.group(1), line_fixed) + + output, prev2_line, prev_line = line_fixed.split("\n") + + sys.stdout.write(output + "\n") +sys.stdout.write(prev2_line + "\n") +sys.stdout.write(prev_line + "\n") diff --git a/egs/iam/v2/local/score.sh b/egs/iam/v2/local/score.sh new file mode 100755 index 00000000000..b2032909333 --- /dev/null +++ b/egs/iam/v2/local/score.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 + +# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's +# using local/unk_arc_post_to_transcription.py and also it calls +# steps/scoring/score_kaldi_cer.sh at the end. + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=3 +max_lmwt=13 +iter=final +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." 
+ echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 +model_path=`echo $dir |xargs dirname` +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + compute-wer --text --mode=present \ + "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." 
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+      utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+        ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+        '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \
+  --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \
+  $data $lang_or_graph $dir
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all wer_{?,??} and cer_{?,??}
+# files; they originate from a previous version of the scoring scripts, and
+# keeping them around could cause confusion about what this script actually produces.
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
+
+exit 0; diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh new file mode 100755 index 00000000000..35eb56b1341 --- /dev/null +++ b/egs/iam/v2/local/train_lm.sh @@ -0,0 +1,156 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Ashish Arora
+#           2017  Hossein Hadian
+# Apache 2.0
+#
+# This script trains an LM on the LOB and Brown corpora (plus the Wellington
+# corpus, if available) together with the IAM training transcriptions.
+# It is based on the example scripts distributed with PocoLM.
+
+# It checks whether pocolm is installed and, if not, prints installation
+# instructions and exits.
+
+set -e
+stage=0
+vocab_size=50000
+
+echo "$0 $@" # Print the command line for logging
+. ./utils/parse_options.sh || exit 1;
+
+dir=data/local/local_lm
+lm_dir=${dir}/data
+
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters,
+# un-comment the following line and change the numbers to some appropriate values.
+# You can find the values in the output log of train_lm.py.
+# The example metaparameter values would be for a 4-gram model (with min-counts)
+# trained with train_lm.py.
+# The dev perplexity should be close to that of the non-bypassed model.
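# (Illustrative sketch only, not part of the patch: an uncommented line would
# look roughly like the one below. --bypass-metaparameter-optimization is a
# pocolm train_lm.py option; the numbers, and how many of them there are, are
# placeholders to be replaced with the values printed in your own optimization log.)
#   bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.79,0.05,0.88,0.72,0.42,0.31"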
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + if [ ! -f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/lob.txt + cat data/local/browncorpus/brown.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/brown.txt + if [ -d "data/local/wellingtoncorpus" ]; then + cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/wellington.txt + fi + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/iam.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + if [ -d "data/local/wellingtoncorpus" ]; then + cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + else + echo "$0: Wellington Corpus not found. Proceeding without using that corpus." + cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + fi + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=6 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='brown=2 lob=2 iam=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/v2/local/wer_output_filter b/egs/iam/v2/local/wer_output_filter new file mode 100755 index 00000000000..24691a160a9 --- /dev/null +++ b/egs/iam/v2/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/iam/v2/path.sh b/egs/iam/v2/path.sh new file mode 100755 index 00000000000..7e458144624 --- /dev/null +++ b/egs/iam/v2/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + +export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/home/dpovey/libs:$LD_LIBRARY_PATH +export LC_ALL=C diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh new file mode 100755 index 00000000000..de5c7086ec2 --- /dev/null +++ b/egs/iam/v2/run_end2end.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). +# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" +wellington_database=/export/corpora5/Wellington/WWC/ + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --wellington-dir "$wellington_database" \ + --username "$username" --password "$password" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | \ + local/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. 
+ # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + local/chain/run_cnn_e2eali.sh +fi diff --git a/egs/iam/v2/steps b/egs/iam/v2/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iam/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iam/v2/utils b/egs/iam/v2/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iam/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/librispeech/s5/local/download_and_untar.sh b/egs/librispeech/s5/local/download_and_untar.sh index d01e681fed7..1bb6d909edc 100755 --- a/egs/librispeech/s5/local/download_and_untar.sh +++ b/egs/librispeech/s5/local/download_and_untar.sh @@ -67,7 +67,9 @@ if [ -f $data/$part.tar.gz ]; then fi fi -if [ ! -f $data/$part.tar.gz ]; then +pushd $data + +if [ ! -f $part.tar.gz ]; then if ! which wget >/dev/null; then echo "$0: wget is not installed." exit 1; @@ -75,20 +77,19 @@ if [ ! -f $data/$part.tar.gz ]; then full_url=$url/$part.tar.gz echo "$0: downloading data from $full_url. This may take some time, please be patient." - cd $data if ! wget --no-check-certificate $full_url; then echo "$0: error executing wget $full_url" exit 1; fi fi -cd $data - if ! tar -xvzf $part.tar.gz; then echo "$0: error un-tarring archive $data/$part.tar.gz" exit 1; fi +popd >&/dev/null + touch $data/LibriSpeech/$part/.complete echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index db9e78a2eac..5d27476d3e1 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -116,7 +116,8 @@ if [ $stage -le 8 ]; then echo "$0: Aligning the training data using the e2e chain model..." echo "Date: $(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..0b86ace2de1 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,307 @@ +#!/bin/bash + +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1h.sh, but adding CNN layers +# near the beginning. 
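Like the other tuning scripts in this recipe, the options declared below (stage, affix, train_set, gmm, and so on) are picked up from the command line via utils/parse_options.sh, so the script can be rerun under non-default settings. A minimal sketch of such a call; the affix value is illustrative and not taken from this patch:

  # run from egs/mini_librispeech/s5; the values after the flags are placeholders
  local/chain/tuning/run_cnn_tdnn_1a.sh --stage 0 --affix 1a_retry \
    --train-set train_clean_5 --gmm tri3b

The WER comparison against the plain TDNN baseline follows.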
+ +# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1h_sp cnn_tdnn1a_sp +#WER dev_clean_2 (tgsmall) 12.09 11.15 +# [online:] 12.11 11.17 +#WER dev_clean_2 (tglarge) 8.59 7.79 +# [online:] 8.76 7.80 +# Final train prob -0.0493 -0.0467 +# Final valid prob -0.0805 -0.0789 +# Final train prob (xent) -1.1730 -1.0767 +# Final valid prob (xent) -1.3872 -1.3070 +# Num-params 5207856 4492816 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + cnn_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). + tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 22. Better, on average. -# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a23_sp exp/chain/cnn_tdnn1a23b_sp exp/chain/cnn_tdnn1a24_sp exp/chain/cnn_tdnn1a24b_sp -# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a23_sp cnn_tdnn1a23b_sp cnn_tdnn1a24_sp cnn_tdnn1a24b_sp -#WER dev_clean_2 (tgsmall) 13.18 13.04 12.15 12.11 11.95 11.86 -# [online:] 13.03 12.97 12.18 12.07 11.99 11.96 -#WER dev_clean_2 (tglarge) 9.18 9.16 8.57 8.47 8.57 8.54 -# [online:] 9.29 9.24 8.64 8.50 8.63 8.57 -# Final train prob -0.0531 -0.0590 -0.0456 -0.0462 -0.0461 -0.0455 -# Final valid prob -0.0844 -0.0865 -0.0800 -0.0802 -0.0800 -0.0798 -# Final train prob (xent) -1.5244 -1.7771 -1.0691 -1.0683 -1.0776 -1.0781 -# Final valid prob (xent) -1.7447 -1.9611 -1.3190 -1.3108 -1.3131 -1.3190 -# Num-params 3512112 3512112 4474688 4474688 4474688 4474688 - -# 1a23 is as 1a14 but for the last cnn layer (cnn5), using twice the num-filters -# plus subsampling on the output. -# A bit better, on average! 
-# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a14_sp exp/chain/cnn_tdnn1a14b_sp exp/chain/cnn_tdnn1a23_sp exp/chain/cnn_tdnn1a23b_sp -# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a14_sp cnn_tdnn1a14b_sp cnn_tdnn1a23_sp cnn_tdnn1a23b_sp -#WER dev_clean_2 (tgsmall) 13.18 13.04 12.14 12.39 12.15 12.11 -# [online:] 13.03 12.97 12.10 12.38 12.18 12.07 -#WER dev_clean_2 (tglarge) 9.18 9.16 8.44 8.69 8.57 8.47 -# [online:] 9.29 9.24 8.58 8.81 8.64 8.50 -# Final train prob -0.0531 -0.0590 -0.0455 -0.0460 -0.0456 -0.0462 -# Final valid prob -0.0844 -0.0865 -0.0806 -0.0802 -0.0800 -0.0802 -# Final train prob (xent) -1.5244 -1.7771 -1.0792 -1.0763 -1.0691 -1.0683 -# Final valid prob (xent) -1.7447 -1.9611 -1.3221 -1.3173 -1.3190 -1.3108 -# Num-params 3512112 3512112 4456224 4456224 4474688 4474688 - -# 1a14 is as 1a13 but with an extra tdnn-f layer. Better! -# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a13_sp exp/chain/cnn_tdnn1a14_sp -# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a13_sp cnn_tdnn1a14_sp -#WER dev_clean_2 (tgsmall) 13.18 13.04 12.21 12.14 -# [online:] 13.03 12.97 12.26 12.10 -#WER dev_clean_2 (tglarge) 9.18 9.16 8.65 8.44 -# [online:] 9.29 9.24 8.67 8.58 -# Final train prob -0.0531 -0.0590 -0.0459 -0.0455 -# Final valid prob -0.0844 -0.0865 -0.0810 -0.0806 -# Final train prob (xent) -1.5244 -1.7771 -1.0901 -1.0792 -# Final valid prob (xent) -1.7447 -1.9611 -1.3328 -1.3221 -# Num-params 3512112 3512112 4160544 4456224 - -# 1a13 is as 1a12 but using the same l2 values for the first layers as for the -# later ones (more l2). -# 1a12 is as 1a11 but making the first TDNN-F layer non-splicing and restoring -# the 640's to 768's. -# 1a11 is as 1a10 but adding some l2 to the CNN layers and to the TDNN layers -# for the ivector training. -# run_cnn_tdnn_1a10.sh is as run_cnn_tdnn_1a.sh but reducing the 768's to 640 -# to make the num-params similar to the tdnn1h experiment (run_cnn_tdnn_1a.sh was overfitting -# a bit). -# -# run_cnn_tdnn_1a.sh is modified from run_tdnn_1h.sh, but adding CNN layers -# near the beginning. - -# 1h is as 1g but a re-tuned model based on resnet-style TDNN-F layers with -# bypass connections. Below, 1h2 is just a rerun of 1h with a different --affix -# option, to give some idea of the run-to-run variation. 
- -# local/chain/compare_wer.sh --online exp/chain/tdnn1g_sp exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp -# System tdnn1g_sp tdnn1h_sp tdnn1h2_sp -#WER dev_clean_2 (tgsmall) 13.50 13.18 13.04 -# [online:] 13.52 13.03 12.97 -#WER dev_clean_2 (tglarge) 9.79 9.18 9.16 -# [online:] 9.79 9.29 9.24 -# Final train prob -0.0460 -0.0531 -0.0590 -# Final valid prob -0.0892 -0.0844 -0.0865 -# Final train prob (xent) -1.1739 -1.5244 -1.7771 -# Final valid prob (xent) -1.4487 -1.7447 -1.9611 -# Num-params 6234672 3512112 3512112 - -# steps/info/chain_dir_info.pl exp/chain/tdnn1{g,h,h2}_sp -# exp/chain/tdnn1g_sp: num-iters=25 nj=2..5 num-params=6.2M dim=40+100->2328 combine=-0.056->-0.055 (over 3) xent:train/valid[15,24,final]=(-1.50,-1.23,-1.17/-1.73,-1.52,-1.45) logprob:train/valid[15,24,final]=(-0.063,-0.051,-0.046/-0.101,-0.094,-0.089) -# exp/chain/tdnn1h_sp: num-iters=34 nj=2..5 num-params=3.5M dim=40+100->2328 combine=-0.055->-0.050 (over 4) xent:train/valid[21,33,final]=(-1.97,-1.57,-1.52/-2.11,-1.78,-1.74) logprob:train/valid[21,33,final]=(-0.080,-0.061,-0.053/-0.106,-0.096,-0.084) -# exp/chain/tdnn1h2_sp: num-iters=34 nj=2..5 num-params=3.5M dim=40+100->2328 combine=-0.062->-0.056 (over 4) xent:train/valid[21,33,final]=(-2.21,-1.78,-1.78/-2.34,-1.96,-1.96) logprob:train/valid[21,33,final]=(-0.086,-0.066,-0.059/-0.110,-0.098,-0.087) - -# Set -e here so that we catch if any executable fails immediately -set -euo pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -decode_nj=10 -train_set=train_clean_5 -test_sets=dev_clean_2 -gmm=tri3b -nnet3_affix= - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -affix=1a54 # affix for the TDNN directory name -tree_affix= -train_stage=-10 -get_egs_stage=-10 -decode_iter= - -# training options -# training chunk-options -chunk_width=140,100,160 -dropout_schedule='0,0@0.20,0.3@0.50,0' -common_egs_dir= -xent_regularize=0.1 - -# training options -srand=0 -remove_egs=true -reporting_email= - -#decode options -test_online_decoding=true # if true, it will run the last decoding stage. - - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 11 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 12 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 13 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - cnn_opts="l2-regularize=0.03" - ivector_affine_opts="l2-regularize=0.03" - tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" - tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" - tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" - linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" - prefinal_opts="l2-regularize=0.03" - output_opts="l2-regularize=0.015" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # this takes the MFCCs and generates filterbank coefficients. The MFCCs - # are more compressible so we prefer to dump the MFCCs to disk rather - # than filterbanks. - idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat - - linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) - batchnorm-component name=ivector-batchnorm target-rms=0.025 - - batchnorm-component name=idct-batchnorm input=idct - combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 - - conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 - conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 - conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 - conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 - conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 - conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 - - # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the - # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to - # limit the num-parameters). 
- tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - linear-component name=prefinal-l dim=192 $linear_opts - - ## adding the layers for chain branch - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - - # adding the layers for xent branch - prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 14 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd="$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.0 \ - --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.add-option="--optimization.memory-compression-level=2" \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs=20 \ - --trainer.frames-per-iter=3000000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=5 \ - --trainer.optimization.initial-effective-lrate=0.002 \ - --trainer.optimization.final-effective-lrate=0.0002 \ - --trainer.num-chunk-per-minibatch=128,64 \ - --egs.chunk-width=$chunk_width \ - --egs.dir="$common_egs_dir" \ - --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ - --use-gpu=true \ - --reporting.email="$reporting_email" \ - --feat-dir=$train_data_dir \ - --tree-dir=$tree_dir \ - --lat-dir=$lat_dir \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 15 ]; then - # Note: it's not important to give mkgraph.sh the lang directory with the - # matched topology (since it gets the topology file from the model). 
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if [ $stage -le 16 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - nspk=$(wc -l 2854 combine=-0.042->-0.042 (over 2) xent:train/valid[71,107,final]=(-0.975,-0.640,-0.646/-0.980,-0.678,-0.688) logprob:train/valid[71,107,final]=(-0.067,-0.043,-0.042/-0.069,-0.050,-0.049) -# exp/chain/cnn_tdnn1b17_sp: num-iters=144 nj=2..8 num-params=6.9M dim=40+100->2854 combine=-0.041->-0.041 (over 3) xent:train/valid[95,143,final]=(-0.866,-0.617,-0.620/-0.881,-0.657,-0.659) logprob:train/valid[95,143,final]=(-0.061,-0.042,-0.041/-0.062,-0.050,-0.049) - -# The following table compares chain (TDNN+LSTM, TDNN, CNN+TDNN). -# The CNN+TDNN doesn't seem to have any advantages versus the TDNN (and it's -# about 5 times slower per iteration). But it's not well tuned. -# And the num-params is fewer (5.5M vs 7.6M for TDNN). - -# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/cnn_tdnn1a_sp -# System tdnn_lstm1a_sp tdnn1a_sp cnn_tdnn1a_sp -#WER dev93 (tgpr) 7.48 7.87 9.02 -#WER dev93 (tg) 7.41 7.61 8.60 -#WER dev93 (big-dict,tgpr) 5.64 5.71 6.97 -#WER dev93 (big-dict,fg) 5.40 5.10 6.12 -#WER eval92 (tgpr) 5.67 5.23 5.56 -#WER eval92 (tg) 5.46 4.87 5.05 -#WER eval92 (big-dict,tgpr) 3.69 3.24 3.40 -#WER eval92 (big-dict,fg) 3.28 2.71 2.73 -# Final train prob -0.0341 -0.0414 -0.0532 -# Final valid prob -0.0506 -0.0634 -0.0752 -# Final train prob (xent) -0.5643 -0.8216 -1.0857 -# Final valid prob (xent) -0.6648 -0.9208 -1.1505 - - set -e -o pipefail @@ -128,7 +34,7 @@ num_threads_ubm=32 nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. # Options which are not passed through to run_ivector_common.sh -affix=1b17 #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. common_egs_dir= reporting_email= diff --git a/egs/wsj/s5/steps/align_basis_fmllr.sh b/egs/wsj/s5/steps/align_basis_fmllr.sh index 417fbc96c8a..e5510c5ab7e 100755 --- a/egs/wsj/s5/steps/align_basis_fmllr.sh +++ b/egs/wsj/s5/steps/align_basis_fmllr.sh @@ -20,6 +20,7 @@ cmd=run.pl use_graphs=false # Begin configuration. scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +basis_fmllr_opts="--fmllr-min-count=22 --num-iters=10 --size-scale=0.2 --step-size-iters=3" beam=10 retry_beam=40 boost_silence=1.5 # factor by which to boost silence during alignment. 
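Collecting the previously hard-coded gmm-est-basis-fmllr-gpost flags into a single basis_fmllr_opts variable also makes them overridable per invocation, assuming the usual utils/parse_options.sh handling used by the steps/ scripts (a variable in the configuration section becomes a --basis-fmllr-opts option). A minimal sketch of such an override; the data, lang and model directories are placeholders, not paths from this patch:

  # raise the per-speaker min-count for basis-fMLLR estimation; the other values keep the defaults above
  steps/align_basis_fmllr.sh --nj 8 --cmd "$train_cmd" \
    --basis-fmllr-opts "--fmllr-min-count=50 --num-iters=10 --size-scale=0.2 --step-size-iters=3" \
    data/train data/lang exp/tri3b exp/tri3b_ali_basis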
@@ -136,22 +137,20 @@ if [ $stage -le 2 ]; then ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ - gmm-est-basis-fmllr-gpost --fmllr-min-count=22 --num-iters=10 \ - --size-scale=0.2 --step-size-iters=3 \ - --write-weights=ark:$dir/pre_wgt.JOB \ + gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \ $mdl $srcdir/fmllr.basis "$sifeats" ark,s,cs:- \ ark:$dir/trans.JOB || exit 1; -# else -# $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ -# ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ -# weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ -# gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ -# --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ -# ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-basis-fmllr $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \ + $mdl $srcdir/fmllr.basis "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; fi fi -feats="$sifeats transform-feats ark:$dir/trans.JOB ark:- ark:- |" +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" if [ $stage -le 3 ]; then echo "$0: doing final alignment." diff --git a/egs/wsj/s5/steps/align_basis_fmllr_lats.sh b/egs/wsj/s5/steps/align_basis_fmllr_lats.sh new file mode 100755 index 00000000000..426168496cc --- /dev/null +++ b/egs/wsj/s5/steps/align_basis_fmllr_lats.sh @@ -0,0 +1,184 @@ +#!/bin/bash +# +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Version of align_fmllr_lats.sh that uses "basis fMLLR", so it is suitable for +# situations where there is very little data per speaker (e.g. when there is a +# one-to-one mapping between utterances and speakers). Intended for use where +# the model was trained with basis-fMLLR (i.e. when you trained the model with +# train_sat_basis.sh where you normally would have trained with train_sat.sh), +# or when it was trained with SAT but you ran get_fmllr_basis.sh on the +# source-model directory. + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=10 +retry_beam=40 +final_beam=20 # For the lattice-generation phase there is no retry-beam. This + # is a limitation of gmm-latgen-faster. We just use an + # intermediate beam. We'll lose a little data and it will be + # slightly slower. (however, the min-active of 200 that + # gmm-latgen-faster defaults to may help.) +boost_silence=1.0 # factor by which to boost silence during alignment. +basis_fmllr_opts="--fmllr-min-count=22 --num-iters=10 --size-scale=0.2 --step-size-iters=3" + +generate_ali_from_lats=false # If true, alingments generated from lattices. +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_fmllr_lats.sh " + echo "e.g.: steps/align_fmllr_lats.sh data/train data/lang exp/tri1 exp/tri1_lats" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +if [ ! -f $srcdir/fmllr.basis ]; then + echo "$0: expected $srcdir/fmllr.basis to exist. Run get_fmllr_basis.sh on $srcdir." +fi + +for f in $data/feats.scp $lang/phones.txt $srcdir/final.mdl; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +sdata=$data/split$nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.alimdl $dir 2>/dev/null +cp $srcdir/final.occs $dir; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +cp $srcdir/delta_opts $dir 2>/dev/null + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + cp $srcdir/full.mat $dir 2>/dev/null + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |" +mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" + + +## because gmm-latgen-faster doesn't support adding the transition-probs to the +## graph itself, we need to bake them into the compiled graphs. This means we can't reuse previously compiled graphs, +## because the other scripts write them without transition probs. +if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + + +if [ $stage -le 1 ]; then + # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because, + # as explained above, we compiled the transition probs into the training + # graphs. + echo "$0: aligning data in $data using $alimdl and speaker-independent features." 
+ $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \ + --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-basis-fmllr-gpost $basis_fmllr_opts \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl $srcdir/fmllr.basis "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-basis-fmllr $basis_fmllr_opts \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl $srcdir/fmllr.basis "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + fi +fi + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +if [ $stage -le 3 ]; then + # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more + # alignment errors (however, it does have a default min-active=200 so this + # will tend to reduce alignment errors). + # --allow_partial=false makes sure we reach the end of the decoding graph. + # --word-determinize=false makes sure we retain the alternative pronunciations of + # words (including alternatives regarding optional silences). + # --lattice-beam=$beam keeps all the alternatives that were within the beam, + # it means we do no pruning of the lattice (lattices from a training transcription + # will be small anyway). + echo "$0: generating lattices containing alternate pronunciations." + $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$final_beam \ + --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \ + "$mdl_cmd" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +fi + +if [ $stage -le 4 ] && $generate_ali_from_lats; then + # If generate_alignments is true, ali.*.gz is generated in lats dir + $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \ + lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \ + ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz 2>/dev/null || true + +echo "$0: done generating lattices from training transcripts." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh index 187d9bf5687..b47b97ef994 100755 --- a/egs/wsj/s5/steps/align_fmllr_lats.sh +++ b/egs/wsj/s5/steps/align_fmllr_lats.sh @@ -5,7 +5,7 @@ # Version of align_fmllr.sh that generates lattices (lat.*.gz) with # alignments of alternative pronunciations in them. Mainly intended -# as a precursor to CTC training for now. +# as a precursor to LF-MMI/chain training for now. # Begin configuration section. 
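For reference, the new steps/align_basis_fmllr_lats.sh introduced above keeps the same four-argument calling convention as align_fmllr_lats.sh; a minimal sketch of a typical call, with placeholder directory names rather than anything taken from this patch:

  # the source directory must already contain fmllr.basis (see steps/get_fmllr_basis.sh)
  steps/align_basis_fmllr_lats.sh --nj 20 --cmd "$train_cmd" \
    --generate-ali-from-lats true \
    data/train data/lang exp/tri3b exp/tri3b_lats_basis

Setting --generate-ali-from-lats true additionally writes ali.*.gz into the lattice directory, as handled by stage 4 of the script.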
stage=0 diff --git a/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh b/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh index de99fd8e624..d1297ccd836 100755 --- a/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh +++ b/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh @@ -111,7 +111,7 @@ delta_opts=`cat $srcdir/delta_opts 2>/dev/null` || true silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; -utils/lang/check_phones_compatible.sh $graph_dir/phones.txt $srcdir/phones.txt +utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt # Some checks. Note: we don't need $srcdir/tree but we expect # it should exist, given the current structure of the scripts. diff --git a/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh b/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh index b641cd18cbb..ff0a87ae295 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh +++ b/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh @@ -44,7 +44,7 @@ $cmd JOB=1:$num_jobs $dir/log/get_phone_alignments.JOB.log \ set -o pipefail '&&' ali-to-phones --write-lengths=true "$model" \ "ark:gunzip -c $dir/ali.JOB.gz|" ark,t:- \| \ sed -E 's/^[^ ]+ //' \| \ - awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ + awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; if (NF>1) print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ sort \| uniq -c \| gzip -c '>' $dir/phone_stats.JOB.gz || exit 1 if ! $cmd $dir/log/analyze_alignments.log \ diff --git a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh index 98b33d9d09d..d580f516527 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh +++ b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh @@ -51,7 +51,7 @@ $cmd JOB=1:$num_jobs $dir/log/lattice_best_path.JOB.log \ $cmd JOB=1:$num_jobs $dir/log/get_lattice_stats.JOB.log \ ali-to-phones --write-lengths=true "$model" "ark:gunzip -c $dir/ali_tmp.JOB.gz|" ark,t:- \| \ sed -E 's/^[^ ]+ //' \| \ - awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ + awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; if (NF>1) print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ sort \| uniq -c \| gzip -c '>' $dir/phone_stats.JOB.gz || exit 1 diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index 1e8e2ced6ce..503721c23d1 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -18,6 +18,11 @@ import sys import threading +try: + import thread as thread_module +except: + import _thread as thread_module + logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) @@ -230,8 +235,7 @@ def background_command_waiter(command, popen_object, require_zero_status): logger.error(str) # thread.interrupt_main() sends a KeyboardInterrupt to the main # thread, which will generally terminate the program. 
- import thread - thread.interrupt_main() + thread_module.interrupt_main() else: logger.warning(str) diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 73f4e5b6533..1afc26ff163 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -532,7 +532,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): try: report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) - except KeyError, IndexError: + except (KeyError, IndexError): continue total_time = 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 229f290e94c..6afb43824fd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -167,7 +167,7 @@ def train_new_models(dir, iter, srand, num_jobs, # work out the 1-based archive index. archive_index = (k % num_archives) + 1 # previous : frame_shift = (k/num_archives) % frame_subsampling_factor - frame_shift = ((archive_index + k/num_archives) + frame_shift = ((archive_index + k//num_archives) % frame_subsampling_factor) multitask_egs_opts = common_train_lib.get_multitask_egs_opts( diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 720164e5436..d052c78b3f8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -288,7 +288,7 @@ def halve_range_str(range_str): halved_ranges = [] for r in ranges: # a range may be either e.g. '64', or '128:256' - c = [str(max(1, int(x)/2)) for x in r.split(":")] + c = [str(max(1, int(x)//2)) for x in r.split(":")] halved_ranges.append(":".join(c)) return ','.join(halved_ranges) @@ -591,7 +591,7 @@ def get_model_combine_iters(num_iters, num_epochs, models_to_combine.add(num_iters) else: subsample_model_factor = 1 - num_iters_combine = min(max_models_combine, num_iters/2) + num_iters_combine = min(max_models_combine, num_iters//2) models_to_combine = set(range(num_iters - num_iters_combine + 1, num_iters + 1)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index e95de336586..9a856bc6fe1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -748,7 +748,8 @@ def check_configs(self): if self.config['target-rms'] < 0.0: raise RuntimeError("target-rms has invalid value {0}" .format(self.config['target-rms'])) - if self.config['learning-rate-factor'] <= 0.0: + if (self.config['learning-rate-factor'] != '' and + self.config['learning-rate-factor'] <= 0.0): raise RuntimeError("learning-rate-factor has invalid value {0}" .format(self.config['learning-rate-factor'])) diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh index 1dbcbe1a192..049e15df303 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -57,12 +57,9 @@ fi oldlm=$oldlang/G.fst if [ -f $oldlang/G.carpa ]; then oldlm=$oldlang/G.carpa -elif [ ! -f $oldlm ]; then - echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\ - exit 1; fi -[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $oldlm ] && echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" && exit 1; [ ! 
-f $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1; [ ! -f $rnnlm_dir/unk.probs ] &&\ echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1; diff --git a/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh b/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh index 2fcc4a1944d..dd0eeeddddd 100755 --- a/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh +++ b/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh @@ -76,8 +76,8 @@ write_compact=true # If set to false, then writes the lattice in non-compact f if [ $# -lt 5 ]; then echo "Usage: $0 [options] [ ... ] " - echo "e.g.: local/socal/score_fusion.sh --nj 8 \\" - echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\" + echo "e.g.: steps/nnet3/decode_score_fusion.sh --nj 8 \\" + echo " --online-ivector-dir exp/nnet3/ivectors_test_eval92 \\" echo " data/test_eval92_hires exp/nnet3/tdnn/graph exp/nnet3/tdnn/output exp/nnet3/tdnn1/output .. \\" echo " exp/nnet3/tdnn_comb/decode_dev" echo "main options (for others, see top of script file)" @@ -116,9 +116,6 @@ if [ $frame_subsampling_factor -ne 1 ]; then frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" fi -# convert $dir to absolute pathname -fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` - # Possibly use multi-threaded decoder thread_string= [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index 2e368283ed4..784693ee44c 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -130,8 +130,8 @@ if ! [ $num_utts -gt $[$num_utts_subset_valid*4] ]; then fi # Get list of validation utterances. -awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset_valid | sort \ - > $dir/valid_uttlist || exit 1; +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset_valid | sort \ + > $dir/valid_uttlist if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" @@ -145,7 +145,7 @@ if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. fi awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ - utils/shuffle_list.pl | head -$num_utts_subset_train | sort > $dir/train_subset_uttlist || exit 1; + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset_train | sort > $dir/train_subset_uttlist ## Set up features. echo "$0: feature type is raw" diff --git a/egs/wsj/s5/steps/train_sat_basis.sh b/egs/wsj/s5/steps/train_sat_basis.sh index 45384fe4ecd..5245ea0c619 100755 --- a/egs/wsj/s5/steps/train_sat_basis.sh +++ b/egs/wsj/s5/steps/train_sat_basis.sh @@ -17,6 +17,7 @@ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment +basis_fmllr_opts="--fmllr-min-count=22 --num-iters=10 --size-scale=0.2 --step-size-iters=3" context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. 
realign_iters="10 20 30"; fmllr_iters="2 4 6 12"; @@ -93,7 +94,7 @@ esac ## Get initial fMLLR transforms (possibly from alignment dir) if [ -f $alidir/trans.1 ]; then echo "$0: Using transforms from $alidir" - feats="$sifeats transform-feats ark,s,cs:$alidir/trans.JOB ark:- ark:- |" + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" cur_trans_dir=$alidir else if [ $stage -le -5 ]; then @@ -114,13 +115,11 @@ else ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \ gmm-post-to-gpost $alidir/final.mdl "$sifeats" ark:- ark:- \| \ - gmm-est-basis-fmllr-gpost --fmllr-min-count=22 --num-iters=10 \ - --size-scale=0.2 --step-size-iters=3 \ - --write-weights=ark:$dir/pre_wgt.JOB \ + gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \ $alidir/final.mdl $alidir/fmllr.basis "$sifeats" ark,s,cs:- \ ark:$alidir/trans.JOB || exit 1; - feats="$sifeats transform-feats ark,s,cs:$alidir/trans.JOB ark:- ark:- |" + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" cur_trans_dir=$alidir fi fi @@ -214,14 +213,12 @@ while [ $x -lt $num_iters ]; do ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \ gmm-post-to-gpost $dir/$x.mdl "$sifeats" ark:- ark:- \| \ - gmm-est-basis-fmllr-gpost --fmllr-min-count=22 --num-iters=10 \ - --size-scale=0.2 --step-size-iters=3 \ - --write-weights=ark:$dir/pre_wgt.JOB \ + gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \ $dir/$x.mdl $dir/fmllr.basis "$sifeats" ark,s,cs:- \ ark:$dir/trans.JOB || exit 1; fi - feats="$sifeats transform-feats ark:$dir/trans.JOB ark:- ark:- |" + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" cur_trans_dir=$dir fi diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 103a4173dc0..ca0972ca85b 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -6,6 +6,8 @@ # It puts the original contents of data-dir into # data-dir/.backup +cmd="$@" + utt_extra_files= spk_extra_files= @@ -21,6 +23,12 @@ if [ $# != 1 ]; then fi data=$1 + +if [ -f $data/images.scp ]; then + image/fix_data_dir.sh $cmd + exit $? +fi + mkdir -p $data/.backup [ ! 
-d $data ] && echo "$0: no such directory $data" && exit 1; diff --git a/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py b/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py index 81c0df36d2b..f7e0dcbdc5f 100755 --- a/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py +++ b/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py @@ -77,6 +77,10 @@ def find_and_replace_unks(old_lm_lines, max_ngrams, skip_rows): if "\{}-grams:".format(max_ngrams) in line: last_ngram = True + for i in range(max_ngrams): + if "\{}-grams:".format(i+1) in line: + ngram = i+1 + # remove any n-gram states of the form: foo -> X # that is, any n-grams of order > 2 where # is the second-to-last word @@ -85,7 +89,6 @@ def find_and_replace_unks(old_lm_lines, max_ngrams, skip_rows): if passed_2grams: g_unk = unk_pattern.search(line) if g_unk: - ngram = len(g_unk.group(0).split()) - 1 ngram_diffs[ngram] = ngram_diffs[ngram] - 1 unk_row_count += 1 continue diff --git a/egs/wsj/s5/utils/parallel/retry.pl b/egs/wsj/s5/utils/parallel/retry.pl index a039d6f5a74..618e9fb01bc 100755 --- a/egs/wsj/s5/utils/parallel/retry.pl +++ b/egs/wsj/s5/utils/parallel/retry.pl @@ -94,7 +94,6 @@ sub get_log_file { # Later on we might want to figure out which array jobs failed # and have to be rerun, but for now we just die. print STDERR "$0: job failed and log file $log_file does not exist (array job?).\n"; - exit($return_status) } else { rename($log_file, $log_file . ".bak"); print STDERR "$0: job failed; renaming log file to ${log_file}.bak and rerunning\n"; diff --git a/egs/wsj/s5/utils/subset_data_dir.sh b/egs/wsj/s5/utils/subset_data_dir.sh index ba52d140ccc..93ee0971b88 100755 --- a/egs/wsj/s5/utils/subset_data_dir.sh +++ b/egs/wsj/s5/utils/subset_data_dir.sh @@ -124,8 +124,10 @@ function do_filtering { [ -f $srcdir/reco2file_and_channel ] && \ utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm > $destdir/stm + # Filter the STM file for proper sclite scoring + # Copy over the comments from STM file + [ -f $srcdir/stm ] && grep "^;;" $srcdir/stm > $destdir/stm + [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm >> $destdir/stm rm $destdir/reco else diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 453ad6935f6..a8b0542c1bb 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -1,5 +1,6 @@ #!/bin/bash +cmd="$@" no_feats=false no_wav=false @@ -44,6 +45,12 @@ if [ ! -d $data ]; then exit 1; fi +if [ -f $data/images.scp ]; then + cmd=${cmd/--no-wav/} # remove --no-wav if supplied + image/validate_data_dir.sh $cmd + exit $? +fi + for f in spk2utt utt2spk; do if [ ! 
-f $data/$f ]; then echo "$0: no such file $f" diff --git a/misc/docker/centos/Dockerfile b/misc/docker/centos/Dockerfile index 304951fa4e0..27fe31c0566 100644 --- a/misc/docker/centos/Dockerfile +++ b/misc/docker/centos/Dockerfile @@ -7,7 +7,7 @@ ENV CPU_CORE 4 RUN yum update -y RUN yum groupinstall -y "C Development Tools and Libraries" "Development Tools" "System Tools" RUN yum install -y \ - git bzip2 wget subversion which \ + git bzip2 wget subversion which sox \ gcc-c++ make automake autoconf zlib-devel atlas-static \ python diff --git a/misc/docker/fedora/Dockerfile b/misc/docker/fedora/Dockerfile index 68f2d9504c7..4e30f8e66bf 100644 --- a/misc/docker/fedora/Dockerfile +++ b/misc/docker/fedora/Dockerfile @@ -7,8 +7,8 @@ ENV CPU_CORE 4 RUN yum update -y RUN yum groupinstall -y "C Development Tools and Libraries" "Development Tools" RUN yum install -y \ - git bzip2 wget subversion \ - gcc-c++ make automake autoconf zlib-devel \ + git bzip2 wget subversion sox \ + gcc-c++ make automake autoconf zlib-devel atlas-static \ python python3 diff --git a/misc/docker/ubuntu-cuda/Dockerfile b/misc/docker/ubuntu-cuda/Dockerfile index f61d4403355..f6225ee12ed 100644 --- a/misc/docker/ubuntu-cuda/Dockerfile +++ b/misc/docker/ubuntu-cuda/Dockerfile @@ -7,7 +7,7 @@ ENV CPU_CORE 4 RUN \ apt-get update -qq && \ apt-get install -y \ - git bzip2 wget \ + git bzip2 wget sox \ g++ make python python3 \ zlib1g-dev automake autoconf libtool subversion \ libatlas-base-dev diff --git a/misc/docker/ubuntu/Dockerfile b/misc/docker/ubuntu/Dockerfile index 6e2bc5def92..a87330b30fc 100644 --- a/misc/docker/ubuntu/Dockerfile +++ b/misc/docker/ubuntu/Dockerfile @@ -7,7 +7,7 @@ ENV CPU_CORE 4 RUN \ apt-get update -qq && \ apt-get install -y \ - git bzip2 wget \ + git bzip2 wget sox \ g++ make python python3 \ zlib1g-dev automake autoconf libtool subversion \ libatlas-base-dev diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py index 0686c8f88c6..842cafb3c97 100755 --- a/scripts/rnnlm/choose_features.py +++ b/scripts/rnnlm/choose_features.py @@ -11,7 +11,7 @@ sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. " "To be more specific, it chooses the set of features-- you compute " diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py index 83f7d708a49..4310b116ad7 100755 --- a/scripts/rnnlm/get_special_symbol_opts.py +++ b/scripts/rnnlm/get_special_symbol_opts.py @@ -9,7 +9,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script checks whether the special symbols " "appear in words.txt with expected values, if not, it will " diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py index abb8515f330..ab3f9bb382f 100755 --- a/scripts/rnnlm/get_unigram_probs.py +++ b/scripts/rnnlm/get_unigram_probs.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.", epilog="E.g. 
" + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py index e30ce4a94c9..5036db0ed2a 100755 --- a/scripts/rnnlm/get_vocab.py +++ b/scripts/rnnlm/get_vocab.py @@ -9,7 +9,7 @@ sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts " "of words produced by get_unigram_counts.sh", diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py index 54d84077060..aeb7a3ec6ae 100755 --- a/scripts/rnnlm/get_word_features.py +++ b/scripts/rnnlm/get_word_features.py @@ -10,7 +10,7 @@ from collections import defaultdict import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, " "using features from rnnlm/choose_features.py.", diff --git a/scripts/rnnlm/prepare_rnnlm_dir.sh b/scripts/rnnlm/prepare_rnnlm_dir.sh index 1de91bb7232..e6016701916 100755 --- a/scripts/rnnlm/prepare_rnnlm_dir.sh +++ b/scripts/rnnlm/prepare_rnnlm_dir.sh @@ -23,7 +23,7 @@ if [ $# != 3 ]; then echo "Usage: $0 [options] " echo "Sets up the directory for RNNLM training as done by" echo "rnnlm/train_rnnlm.sh, and initializes the model." - echo " is as validated by rnnlm/validate_data_dir.py" + echo " is as validated by rnnlm/validate_text_dir.py" echo " is as validated by rnnlm/validate_config_dir.sh." exit 1 fi diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py index e39f4504f37..cceac48313e 100755 --- a/scripts/rnnlm/prepare_split_data.py +++ b/scripts/rnnlm/prepare_split_data.py @@ -9,7 +9,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, " "for consumption by nnet3-get-egs.", diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 5fe049cb8ce..53d4729b4bb 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -9,7 +9,7 @@ sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.", epilog="E.g. " + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt " diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py index 010ceb72615..2a077da4758 100755 --- a/scripts/rnnlm/validate_features.py +++ b/scripts/rnnlm/validate_features.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.", epilog="E.g. 
" + sys.argv[0] + " exp/rnnlm/features.txt", diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py index 4b311a8abbd..903e720bdf4 100755 --- a/scripts/rnnlm/validate_text_dir.py +++ b/scripts/rnnlm/validate_text_dir.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="Validates data directory containing text " "files from one or more data sources, including dev.txt.", diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py index f8eb5858d95..205b934ae1b 100755 --- a/scripts/rnnlm/validate_word_features.py +++ b/scripts/rnnlm/validate_word_features.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.", epilog="E.g. " + sys.argv[0] + " --features-file=exp/rnnlm/features.txt " diff --git a/src/bin/acc-lda.cc b/src/bin/acc-lda.cc index 92cd192b9a6..b664135bdc7 100644 --- a/src/bin/acc-lda.cc +++ b/src/bin/acc-lda.cc @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) { "Accumulate LDA statistics based on pdf-ids.\n" "Usage: acc-lda [options] \n" "Typical usage:\n" - " ali-to-post ark:1.ali ark:- | lda-acc 1.mdl \"ark:splice-feats scp:train.scp|\" ark:- ldaacc.1\n"; + " ali-to-post ark:1.ali ark:- | acc-lda 1.mdl \"ark:splice-feats scp:train.scp|\" ark:- ldaacc.1\n"; bool binary = true; BaseFloat rand_prune = 0.0; @@ -126,5 +126,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/chainbin/nnet3-chain-train.cc b/src/chainbin/nnet3-chain-train.cc index 9ea7ba1b06f..536669a17d3 100644 --- a/src/chainbin/nnet3-chain-train.cc +++ b/src/chainbin/nnet3-chain-train.cc @@ -20,6 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "nnet3/nnet-chain-training.h" +#include "cudamatrix/cu-allocator.h" int main(int argc, char *argv[]) { @@ -52,6 +53,7 @@ int main(int argc, char *argv[]) { "yes|no|optional|wait, only has effect if compiled with CUDA"); opts.Register(&po); + RegisterCuAllocatorOptions(&po); po.Read(argc, argv); diff --git a/src/configure b/src/configure index 2f506b9073c..41262259165 100755 --- a/src/configure +++ b/src/configure @@ -42,7 +42,7 @@ # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=7 +CONFIGURE_VERSION=8 if ! [ -x "$PWD/configure" ]; then echo 'You must run "configure" from the src/ directory.' @@ -367,7 +367,7 @@ function linux_configure_mkl_threading { function configure_cuda { # Check for CUDA toolkit in the system if [ ! 
-d "$CUDATKDIR" ]; then - for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do + for base in /usr/local/share/cuda /usr/local/cuda /usr/; do if [ -f $base/bin/nvcc ]; then CUDATKDIR=$base fi @@ -395,14 +395,6 @@ function configure_cuda { GCC_VER=$($COMPILER -dumpversion) GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") case $CUDA_VERSION in - 5_5) - MIN_UNSUPPORTED_GCC_VER="5.0" - MIN_UNSUPPORTED_GCC_VER_NUM=50000; - ;; - 6_*) - MIN_UNSUPPORTED_GCC_VER="5.0" - MIN_UNSUPPORTED_GCC_VER_NUM=50000; - ;; 7_*) MIN_UNSUPPORTED_GCC_VER="5.0" MIN_UNSUPPORTED_GCC_VER_NUM=50000; @@ -454,6 +446,8 @@ function configure_cuda { else cat makefiles/cuda_64bit.mk >> kaldi.mk fi + elif [ "`uname -m`" == "aarch64" ]; then + cat makefiles/cuda_64bit.mk >> kaldi.mk elif [ "`uname -m`" == "ppc64le" ]; then cat makefiles/cuda_64bit.mk >> kaldi.mk else diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index fec75b01a3f..cfbc6757530 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -1,6 +1,6 @@ // cudamatrix/cu-allocator.cc -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -19,6 +19,8 @@ +#include "cudamatrix/cu-allocator.h" + #if HAVE_CUDA == 1 #include @@ -28,6 +30,10 @@ #include #include #include +#ifndef _MSC_VER +#include +#endif + #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-matrix.h" @@ -39,10 +45,207 @@ namespace kaldi { void* CuMemoryAllocator::Malloc(size_t size) { - // For now just call MallocPitch and throw away the pitch, to avoid - // duplicating code here. Apparently the time difference is quite small. - size_t pitch; - return MallocPitch(size, 1, &pitch); + Timer tim; + if (!opts_.cache_memory) { + void *ans; + CU_SAFE_CALL(cudaMalloc(&ans, size)); + double elapsed = tim.Elapsed(); + tot_time_taken_ += elapsed; + malloc_time_taken_ += elapsed; + t_++; + return ans; + } + + // We could perhaps change this to KALDI_PARANOID_ASSERT to save time. + KALDI_ASSERT(size != 0); + + // Round up 'size' to a multiple of 256; this ensures the right kind of + // memory alignment. + size = (size + 255) & ~((size_t)255); + void *ans = MallocInternal(size); + tot_time_taken_ += tim.Elapsed(); + return ans; +} + + +CuMemoryAllocator::MemoryBlock *CuMemoryAllocator::SplitBlock( + MemoryBlock *block, size_t size) { + SubRegion *subregion = block->subregion; + // new_block will become the right-most part of 'block', and 'block' will + // be the left-most part. + MemoryBlock *new_block = new MemoryBlock; + bool return_new_block; + char *new_begin; + + // We now decide whether to make the left part of 'block' be of size ('size') + // and return it (the 'if' branch of the if-else block below), or the right + // part (the 'else' branch). We decide this based on heuristics. Basically, + // we want to allocate the sub-block that's either next to the edge of the + // MemoryRegion, or next to something that was allocated long ago (and which, + // we assume won't be deallocated for a relatively long time). 
That is: we + // want to leave the un-allocated memory next to a memory block that was + // recently allocated (and thus is likely to be freed sooner), so that when + // that block is freed we can merge it with the still-unallocated piece into a + // larger block; this will reduce fragmentation. But if this block spans + // multiple sub-regions we don't want to do that, as that would be against our + // heuristic of, where possible, allocating memory from lower-numbered + // sub-regions. + // + // Bear in mind that we can assume block->next and block->prev, if they are + // non-NULL, are both currently allocated, since 'block' is un-allocated and + // we would have merged any adjacent un-allocated sub-regions. + if (block->next != NULL && block->prev != NULL && + block->prev->t < block->next->t && + block->next->subregion == subregion) { + // We'll allocate the right part of the block, since the left side is next + // to a relatively recently-allocated block. + return_new_block = true; + new_begin = block->end - size; + } else { + // We'll allocate the left part of the block. + return_new_block = false; + new_begin = block->begin + size; + } + + // The following code makes sure the SubRegion for 'new_block' is correct, + // i.e. its 'begin' is >= the 'begin' of the subregion and < the 'end' of the + // subregion. If the following loop segfaults, it indicates a bug somewhere + // else. + while (new_begin >= subregion->end) + subregion = subregion->next; + MemoryBlock *next_block = block->next; + new_block->begin = new_begin; + new_block->end = block->end; + new_block->subregion = subregion; + new_block->allocated = false; + new_block->thread_id = block->thread_id; + new_block->t = block->t; + new_block->next = next_block; + new_block->prev = block; + if (next_block) + next_block->prev = new_block; + block->next = new_block; + block->end = new_begin; + + // Add the split-up piece that we won't be allocating, to the + // 'free_blocks' member of its subregion. + if (return_new_block) { + AddToFreeBlocks(block); + return new_block; + } else { + AddToFreeBlocks(new_block); + return block; + } +} + + +void CuMemoryAllocator::RemoveFromFreeBlocks(MemoryBlock *block) { + SubRegion *subregion = block->subregion; + size_t block_size = block->end - block->begin; + std::pair p(block_size, block); + size_t num_removed = subregion->free_blocks.erase(p); + KALDI_ASSERT(num_removed != 0); + // Update largest_free_block_, if needed. + size_t subregion_index = subregion->subregion_index; + if (block_size == largest_free_block_[subregion_index]) { + if (subregion->free_blocks.empty()) + largest_free_block_[subregion_index] = 0; + else + largest_free_block_[subregion_index] = + subregion->free_blocks.begin()->first; + } +} + +void CuMemoryAllocator::AddToFreeBlocks(MemoryBlock *block) { + SubRegion *subregion = block->subregion; + KALDI_PARANOID_ASSERT(block->begin >= subregion->begin && + block->begin < subregion->end); + size_t block_size = block->end - block->begin, + subregion_index = subregion->subregion_index; + // Update largest_free_block_, if needed. 
+ if (block_size > largest_free_block_[subregion_index]) { + largest_free_block_[subregion_index] = block_size; + } + subregion->free_blocks.insert(std::pair(block_size, block)); +} + + +void* CuMemoryAllocator::MallocFromSubregion(SubRegion *subregion, + size_t size) { + // NULL is implementation defined and doesn't have to be zero so we can't + // guarantee that NULL will be <= a valid pointer-- so we cast to a pointer + // from zero instead of using NULL. + std::pair p(size, (MemoryBlock*)0); + + std::set >::iterator iter = + subregion->free_blocks.lower_bound(p); + // so now 'iter' is the first member of free_blocks whose size_t value is >= + // size. If 'iter' was equal to the end() of that multi_map, it would be a + // bug because the calling code checked that the largest free block in this + // region was sufficiently large. We don't check this; if it segfaults, we'll + // debug. + + MemoryBlock *block = iter->second; + // Erase 'block' from its subregion's free blocks list... the next lines are + // similar to RemoveFromFreeBlocks(), but we code it directly as we have the + // iterator here, and it would be wasteful to do another lookup. + subregion->free_blocks.erase(iter); + // Update largest_free_block_, if needed. The following few lines of code also appear + // in RemoveFromFreeBlocks(). + size_t block_size = block->end - block->begin, + subregion_index = subregion->subregion_index; + if (block_size == largest_free_block_[subregion_index]) { + if (subregion->free_blocks.empty()) + largest_free_block_[subregion_index] = 0; + else + largest_free_block_[subregion_index] = + subregion->free_blocks.begin()->first; + } + + KALDI_PARANOID_ASSERT(block_size >= size && block->allocated == false); + + // the most memory we allow to be 'wasted' by failing to split a block, is the + // smaller of: 1/16 of the size we're allocating, or half a megabyte. + size_t allowed_extra_size = std::min(size >> 4, 524288); + if (block_size > size + allowed_extra_size) { + // If the requested block is substantially larger than what was requested, + // split it so we don't waste memory. + block = SplitBlock(block, size); + } + + if (std::this_thread::get_id() != block->thread_id && + block->t > synchronize_gpu_t_) { + // see NOTE ON SYNCHRONIZATION in the header. + SynchronizeGpu(); + synchronize_gpu_t_ = t_; + num_synchronizations_++; + } + block->allocated = true; + block->t = t_; + allocated_block_map_[block->begin] = block; + return block->begin; +} + +// By the time MallocInternal is called, we will have ensured that 'size' is +// a nonzero multiple of 256 (for memory aligment reasons). +// inline +void* CuMemoryAllocator::MallocInternal(size_t size) { +start: + std::vector::const_iterator iter = largest_free_block_.begin(), + end = largest_free_block_.end(); + size_t subregion_index = 0; + for (; iter != end; ++iter, ++subregion_index) { + if (*iter > size) { + return MallocFromSubregion(subregions_[subregion_index], size); + } + } + // We dropped off the loop without finding a subregion with enough memory + // to satisfy the request -> allocate a new region. + AllocateNewRegion(size); + // An infinite loop shouldn't be possible because after calling + // AllocateNewRegion(size), there should always be a SubRegion + // with that size available. + goto start; } // Returns max(0, floor(log_2(i))). Not tested independently. 
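// The free-block lookup in MallocFromSubregion() above keeps each sub-region's
// free blocks in a std::set ordered by (size, pointer) and uses lower_bound()
// to find the smallest free block whose size is >= the request (best fit).
// Below is a minimal standalone sketch of that idea, not Kaldi code: the
// 'Block' and 'FreeList' names are illustrative only, and the real allocator
// additionally splits oversized blocks (SplitBlock) and maintains the
// largest_free_block_ cache.
#include <cstddef>
#include <set>
#include <utility>

struct Block { char *begin; char *end; };

class FreeList {
 public:
  // Returns the smallest free block of at least 'size' bytes, or NULL if no
  // free block is large enough.
  Block *Take(std::size_t size) {
    // (size, null pointer) sorts no later than any real entry of that size,
    // so lower_bound() lands on the first block whose size is >= 'size'.
    std::pair<std::size_t, Block*> key(size, static_cast<Block*>(0));
    std::set<std::pair<std::size_t, Block*> >::iterator it =
        free_blocks_.lower_bound(key);
    if (it == free_blocks_.end()) return NULL;
    Block *block = it->second;
    free_blocks_.erase(it);  // the block is no longer free
    return block;
  }
  // Returns a block to the free list, keyed by its size.
  void Give(Block *block) {
    std::size_t block_size = block->end - block->begin;
    free_blocks_.insert(std::make_pair(block_size, block));
  }
 private:
  std::set<std::pair<std::size_t, Block*> > free_blocks_;
};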
@@ -63,311 +266,341 @@ static inline size_t IntegerLog2(size_t i) { return ans; } -//inline -CuMemoryAllocator::MruCache& CuMemoryAllocator::GetCacheForSize( - size_t num_bytes) { - size_t bucket_index = IntegerLog2(num_bytes); - KALDI_ASSERT(num_bytes > 0 && bucket_index < caches_.size()); - return caches_[bucket_index]; -} - -//inline -void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes, - size_t num_rows, - size_t *pitch) { - num_system_allocations_++; - void *ans; - cudaError_t e; - for (int32 i = 0; i <= 2; i++) { - if (num_rows != 1) { - CuTimer tim; - e = cudaMallocPitch(&ans, pitch, row_bytes, num_rows); - tot_time_taken_in_cuda_malloc_pitch_ += tim.Elapsed(); +std::string GetFreeGpuMemory(int64* free, int64* total) { +#ifdef _MSC_VER + size_t mem_free, mem_total; + cuMemGetInfo_v2(&mem_free, &mem_total); +#else + // define the function signature type + size_t mem_free, mem_total; + { + // we will load cuMemGetInfo_v2 dynamically from libcuda.so + // pre-fill ``safe'' values that will not cause problems + mem_free = 1; mem_total = 1; + // open libcuda.so + void* libcuda = dlopen("libcuda.so", RTLD_LAZY); + if (NULL == libcuda) { + KALDI_WARN << "cannot open libcuda.so"; } else { - CuTimer tim; - // we might save a little time this way. - e = cudaMalloc(&ans, row_bytes); - tot_time_taken_in_cuda_malloc_ += tim.Elapsed(); - *pitch = row_bytes; - } - if (e != cudaSuccess) { - PrintMemoryUsage(); - // On the first 2 out of the 3 iters, try freeing memory. - if (i <= 1) { - KALDI_WARN << "Allocation of " << row_bytes << " x " - << num_rows << " region failed: freeing some memory and " - << "trying again. "; - BaseFloat new_memory_factor = 1.1; - if (opts_.memory_factor > new_memory_factor) { - KALDI_LOG << "To avoid future problems like this, changing " - << "memory_factor from " << opts_.memory_factor << " to " - << new_memory_factor; - opts_.memory_factor = new_memory_factor; - } - size_t memory_cached = MemoryCached(), - memory_requested = row_bytes * num_rows, - memory_to_free = std::max(memory_cached / 2, - std::min(memory_cached, - memory_requested)); - FreeSomeCachedMemory(memory_to_free); + // define the function signature type + // and get the symbol + typedef CUresult (*cu_fun_ptr)(size_t*, size_t*); + cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); + if (NULL == dl_cuMemGetInfo) { + KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so"; } else { - KALDI_ERR << "Cannot allocate the requested memory (" - << row_bytes << " x " << num_rows << " = " - << row_bytes * num_rows << " bytes)"; + // call the function + dl_cuMemGetInfo(&mem_free, &mem_total); } - cudaGetLastError(); // Clear the error state. 
- } else { - break; + // close the library + dlclose(libcuda); } } - return ans; +#endif + // copy the output values outside + if (NULL != free) *free = mem_free; + if (NULL != total) *total = mem_total; + // prepare the text output + std::ostringstream os; + os << "free:" << mem_free/(1024*1024) << "M, " + << "used:" << (mem_total-mem_free)/(1024*1024) << "M, " + << "total:" << mem_total/(1024*1024) << "M, " + << "free/total:" << mem_free/(float)mem_total; + return os.str(); } void CuMemoryAllocator::PrintMemoryUsage() const { - KALDI_LOG << "Memory usage: " << cur_bytes_allocated_ - << " bytes currently allocated (max: " - << max_bytes_allocated_ << "); " << cur_bytes_used_ - << " currently in use by user (max: " << max_bytes_used_ << ")" - << "; " << num_system_allocations_ << '/' - << num_user_allocations_ << " calls to Malloc* resulted in " - << "CUDA calls."; - if (GetVerboseLevel() >= 1) { - // CuTimer only accumulates stats at verbose level 1 or above. - KALDI_LOG << "Time taken in cudaMallocPitch=" << tot_time_taken_in_cuda_malloc_pitch_ - << ", in cudaMalloc=" << tot_time_taken_in_cuda_malloc_ - << ", in cudaFree=" << tot_time_taken_in_cuda_free_ - << ", in this->MallocPitch()=" << tot_time_taken_in_malloc_pitch_; + if (!opts_.cache_memory) { + KALDI_LOG << "Not caching allocations; time taken in " + << "malloc/free is " << malloc_time_taken_ + << "/" << (tot_time_taken_ - malloc_time_taken_) + << ", num operations is " << t_ + << "; device memory info: " + << GetFreeGpuMemory(NULL, NULL); + return; } + + size_t num_blocks_allocated = 0, num_blocks_free = 0, + memory_allocated = 0, memory_held = 0, + largest_free_block = 0, largest_allocated_block = 0; + + for (size_t i = 0; i < memory_regions_.size(); i++) { + MemoryBlock *m = memory_regions_[i].block_begin; + KALDI_ASSERT(m->begin == memory_regions_[i].begin); + for (; m != NULL; m = m->next) { + size_t size = m->end - m->begin; + if (m->allocated) { + num_blocks_allocated++; + memory_allocated += size; + if (size > largest_allocated_block) + largest_allocated_block = size; + } else { + num_blocks_free++; + if (size > largest_free_block) + largest_free_block = size; + } + memory_held += size; + // The following is just some sanity checks; this code is rarely called so + // it's a reasonable place to put them. + if (m->next) { + KALDI_ASSERT(m->next->prev == m && m->end == m->next->begin); + } else { + KALDI_ASSERT(m->end == memory_regions_[m->subregion->memory_region].end); + } + } + } + KALDI_LOG << "Memory usage: " << memory_allocated << "/" + << memory_held << " bytes currently allocated/total-held; " + << num_blocks_allocated << "/" << num_blocks_free + << " blocks currently allocated/free; largest " + << "free/allocated block sizes are " + << largest_allocated_block << "/" << largest_free_block + << "; time taken total/cudaMalloc is " + << tot_time_taken_ << "/" << malloc_time_taken_ + << ", synchronized the GPU " << num_synchronizations_ + << " times out of " << (t_/2) << " frees; " + << "device memory info: " << GetFreeGpuMemory(NULL, NULL); +} + +// Note: we just initialize with the default options, but we can change it later +// (as long as it's before we first use the class) by calling SetOptions(). 
+CuMemoryAllocator::CuMemoryAllocator(): + opts_(CuAllocatorOptions()), + t_(0), + synchronize_gpu_t_(0), + num_synchronizations_(0), + tot_time_taken_(0.0), + malloc_time_taken_(0.0) { + // Note: we don't allocate any memory regions at the start; we wait for the user + // to call Malloc() or MallocPitch(), and then allocate one when needed. } -CuMemoryAllocator::CuMemoryAllocator(CuAllocatorOptions opts): - opts_(opts), - caches_(40), - cur_bytes_allocated_(0), - max_bytes_allocated_(0), - cur_bytes_used_(0), - max_bytes_used_(0), - t_(1), - num_user_allocations_(0), - num_system_allocations_(0), - tot_time_taken_in_cuda_malloc_(0.0), - tot_time_taken_in_cuda_malloc_pitch_(0.0), - tot_time_taken_in_cuda_free_(0.0), - tot_time_taken_in_malloc_pitch_(0.0) { } void* CuMemoryAllocator::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { - CuTimer tim; - t_++; - num_user_allocations_++; - size_t requested_bytes = row_bytes * num_rows; - if (cur_bytes_used_ + requested_bytes > max_bytes_used_) - max_bytes_used_ = cur_bytes_used_ + requested_bytes; - MruCache &cache = GetCacheForSize(requested_bytes); - MemoryRequest request(row_bytes, num_rows); - CachedMemoryElement output; - if (cache.Lookup(request, &output)) { - // we have cached memory with this value. - void *ans = output.pointer; - *pitch = output.pitch; - used_map_[ans] = UsedMemoryElement(row_bytes, num_rows, output.pitch); - cur_bytes_used_ += requested_bytes; - tot_time_taken_in_malloc_pitch_ += tim.Elapsed(); - return ans; - } else { - // note: it's important that we already updated max_bytes_used_. - size_t next_bytes_allocated = cur_bytes_allocated_ + requested_bytes, - max_bytes_to_allocate = - static_cast(opts_.memory_factor * max_bytes_used_); - ssize_t bytes_overflow = next_bytes_allocated - max_bytes_to_allocate; - if (bytes_overflow > 0) { - // The amount we would have allocated, after fulfilling this request, - // would exceed our limits (we don't allow ourselves to allocate more than - // memory_factor times the maximum amount of memory the user ever owns - // during the lifetime of the program). So free some memory. - KALDI_ASSERT(bytes_overflow <= MemoryCached()); // sanity check. - FreeSomeCachedMemory(static_cast(bytes_overflow)); - KALDI_ASSERT(cur_bytes_allocated_ + requested_bytes <= - max_bytes_to_allocate); - } - void *ans = MallocPitchInternal(row_bytes, num_rows, pitch); - cur_bytes_allocated_ += requested_bytes; - if (cur_bytes_allocated_ > max_bytes_allocated_) - max_bytes_allocated_ = cur_bytes_allocated_; - used_map_[ans] = UsedMemoryElement(row_bytes, num_rows, *pitch); - cur_bytes_used_ += requested_bytes; - tot_time_taken_in_malloc_pitch_ += tim.Elapsed(); + Timer tim; + if (!opts_.cache_memory) { + void *ans; + CU_SAFE_CALL(cudaMallocPitch(&ans, pitch, row_bytes, num_rows)); + double elapsed = tim.Elapsed(); + tot_time_taken_ += elapsed; + malloc_time_taken_ += elapsed; return ans; } -} -void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) { - CuTimer tim; - // the next few lines are responsible for increasing the amount of memory we - // are going to free, in case the user requested an amount that's very tiny - // compared with the total amount of memory ever used. This helps us - // to amortize the cost of visiting all of the buckets inside this code. - // (there are only 40 buckets so it's not so big, but we're being careful. 
- size_t bytes_cached = cur_bytes_allocated_ - cur_bytes_used_, - min_to_free = static_cast(max_bytes_used_ * opts_.delete_factor); - size_t bytes_to_free = std::min(bytes_cached, - std::max(bytes_to_free_in, min_to_free)), - bytes_freed = 0; - - size_t num_caches = caches_.size(), - t = t_; - // size_factor contains the approximate (power-of-two) size of the pointers - // that each cache's pointers contain. The 'cost' of keeping any given pointer, - // we declare to be the time since we last used it multiplied by the size - // of the memory in the pointer. - std::vector size_factor(num_caches); - for (size_t i = 0, j=1; i < num_caches; i++, j *= 2) - size_factor[i] = j; - - std::priority_queue > queue; - // Set up the queue. - for (int32 i = 0; i < num_caches; i++) { - const MruCache &cache = caches_[i]; - size_t cache_t = cache.LeastRecentTime(); - if (cache_t > 0) { // t == 0 means the cache is empty. - size_t interval = t - cache_t; - BaseFloat cost = size_factor[i] * interval; - KALDI_ASSERT(interval > 0); - queue.push(std::pair(cost, i)); - } - } - while (bytes_freed < bytes_to_free) { - // If the following fails it means I made some kind of bookkeeping error, - // and most likely we are trying to free more memory than we really have - // cached. - KALDI_ASSERT(!queue.empty() && "Code error."); - std::pair p = queue.top(); - int32 cache_index = p.second; - MruCache &cache = caches_[cache_index]; - queue.pop(); - if (queue.empty()) { - while (bytes_freed < bytes_to_free) { - bytes_freed += cache.RemoveLeastRecentlyUsed(); - } - } else { - BaseFloat next_worst_cost = queue.top().first; - while (1) { - bytes_freed += cache.RemoveLeastRecentlyUsed(); - if (bytes_freed >= bytes_to_free) - break; - size_t least_recent_time = cache.LeastRecentTime(); - if (least_recent_time == 0) // this cache is now empty - break; - size_t interval = t - least_recent_time; - KALDI_ASSERT(interval > 0); - BaseFloat cost = size_factor[cache_index] * interval; - if (cost < next_worst_cost) { - // There is another bucket that has worse cost than this, - // so stop processing this bucket-- but first put it - // back in the queue. - queue.push(std::pair(cost, cache_index)); - break; - } - } - } - } - KALDI_ASSERT(bytes_freed <= cur_bytes_allocated_); - cur_bytes_allocated_ -= bytes_freed; - tot_time_taken_in_cuda_free_ += tim.Elapsed(); + // Round up row_bytes to a multiple of 256. + row_bytes = (row_bytes + 255) & ~((size_t)255); + *pitch = row_bytes; + void *ans = MallocInternal(row_bytes * num_rows); + tot_time_taken_ += tim.Elapsed(); + return ans; } void CuMemoryAllocator::Free(void *ptr) { + CuTimer tim; + if (!opts_.cache_memory) { + CU_SAFE_CALL(cudaFree(ptr)); + tot_time_taken_ += tim.Elapsed(); + t_++; + return; + } t_++; - unordered_map::iterator iter = - used_map_.find(ptr); - if (iter == used_map_.end()) { + unordered_map::iterator iter = + allocated_block_map_.find(ptr); + if (iter == allocated_block_map_.end()) { KALDI_ERR << "Attempt to free CUDA memory pointer that was not allocated: " << ptr; } - const UsedMemoryElement &elem = iter->second; - size_t num_bytes = elem.row_bytes * elem.num_rows; - - cur_bytes_used_ -= num_bytes; - MruCache &cache = GetCacheForSize(num_bytes); + MemoryBlock *block = iter->second; + allocated_block_map_.erase(iter); + block->t = t_; + block->thread_id = std::this_thread::get_id(); + block->allocated = false; + + // If this is not the first block of the memory region and the previous block + // is not allocated, merge this block into the previous block. 
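// The two merge steps in Free() just below (merge into a free previous block,
// then absorb a free next block) are standard free-block coalescing on the
// doubly-linked list of MemoryBlocks.  A simplified standalone sketch of the
// same idea follows; it is not Kaldi code, uses a hypothetical 'Block' type,
// and omits the free_blocks bookkeeping and thread-id/synchronization logic
// of the real allocator.
#include <cstddef>

struct Block {
  char *begin, *end;   // the extent of this block of memory
  bool allocated;      // true if currently owned by the user
  Block *prev, *next;  // adjacent blocks; prev->end == begin, end == next->begin
};

// Called when 'block' is freed; merges it with free neighbours so adjacent
// free blocks always form one larger block, and returns the surviving block.
Block *CoalesceAfterFree(Block *block) {
  block->allocated = false;
  if (block->prev != NULL && !block->prev->allocated) {
    // Extend the previous block to cover this one, then discard this one.
    Block *prev = block->prev;
    prev->end = block->end;
    prev->next = block->next;
    if (block->next != NULL) block->next->prev = prev;
    delete block;
    block = prev;
  }
  if (block->next != NULL && !block->next->allocated) {
    // Extend the surviving block to cover the next one, then discard it.
    Block *next = block->next;
    block->end = next->end;
    block->next = next->next;
    if (next->next != NULL) next->next->prev = block;
    delete next;
  }
  return block;
}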
+ MemoryBlock *prev_block = block->prev; + if (prev_block != NULL && !prev_block->allocated) { + RemoveFromFreeBlocks(prev_block); + prev_block->end = block->end; + if (prev_block->thread_id != block->thread_id) { + // the two blocks we're merging were freed by different threads, so we + // give the 'nonexistent thread' as their thread, which means that + // whichever thread requests that block, we force synchronization. We can + // assume that prev_block was previously allocated (prev_block->t > 0) + // because we always start from the left when allocating blocks, and we + // know that this block was previously allocated. + prev_block->thread_id = std::thread::id(); + } + prev_block->t = t_; + prev_block->next = block->next; + if (block->next) + block->next->prev = prev_block; + delete block; + block = prev_block; + } - cache.Insert(MemoryRequest(elem.row_bytes, elem.num_rows), - CachedMemoryElement(ptr, t_, elem.pitch)); - used_map_.erase(iter); + // If this is not the last block of the memory region and the next block is + // not allocated, merge the next block into this block. + MemoryBlock *next_block = block->next; + if (next_block != NULL && !next_block->allocated) { + // merge next_block into 'block', deleting 'next_block'. Note: at this + // point, if we merged with the previous block, the variable 'block' may now + // be pointing to that previous block, so it would be a 3-way merge. + RemoveFromFreeBlocks(next_block); + block->end = next_block->end; + if (next_block->thread_id != block->thread_id && next_block->t > 0) { + // the two blocks we're merging were freed by different threads, so we + // give the 'nonexistent thread' as their thread, which means that + // whichever thread requests that block, we force synchronization. there + // is no need to do this if next_block->t == 0, which would mean it had + // never been allocated. + block->thread_id = std::thread::id(); + } + // We don't need to inspect the 't' value of next_block; it can't be + // larger than t_ because t_ is now. + block->next = next_block->next; + if (block->next) + block->next->prev = block; + delete next_block; + } + AddToFreeBlocks(block); + tot_time_taken_ += tim.Elapsed(); } -size_t CuMemoryAllocator::MruCache::LeastRecentTime() const { - if (list_.empty()) { - KALDI_PARANOID_ASSERT(map_.empty()); - return 0; - } else { - const MemoryRequest &mr = list_.front(); - MapType::const_iterator iter = map_.find(mr); - KALDI_ASSERT(iter != map_.end()); - const MapValueType &queue = iter->second; - KALDI_ASSERT(!queue.empty()); - return queue.front().first.t; +void CuMemoryAllocator::AllocateNewRegion(size_t size) { + int64 free_memory, total_memory; + std::string mem_info = GetFreeGpuMemory(&free_memory, &total_memory); + opts_.Check(); + size_t region_size = static_cast(free_memory * opts_.memory_proportion); + if (region_size < size) + region_size = size; + // Round up region_size to an exact multiple of 1M (note: we expect it will + // be much larger than that). 1048575 is 2^20 - 1. + region_size = (region_size + 1048575) & ~((size_t)1048575); + + if (!memory_regions_.empty()) { + // If this is not the first region allocated, print some information. 
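// Both Malloc() above (rounding requests up to a multiple of 256 bytes) and
// the region sizing in AllocateNewRegion() (rounding up to a multiple of 1MiB;
// 1048575 is 2^20 - 1) use the same bit-masking idiom for rounding an integer
// up to a multiple of a power of two.  A small illustration, not Kaldi code:
#include <cassert>
#include <cstddef>

// Rounds n up to the next multiple of 'alignment', which must be a power of
// two: (alignment - 1) has all the low bits set, and masking with
// ~(alignment - 1) clears them after adding alignment - 1.
static inline std::size_t RoundUp(std::size_t n, std::size_t alignment) {
  return (n + alignment - 1) & ~(alignment - 1);
}

int main() {
  assert(RoundUp(1, 256) == 256);     // tiny requests become one 256-byte unit
  assert(RoundUp(256, 256) == 256);   // exact multiples are unchanged
  assert(RoundUp(257, 256) == 512);
  assert(RoundUp(3000000, 1 << 20) == 3145728);  // next multiple of 1 MiB
  return 0;
}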
+ KALDI_LOG << "About to allocate new memory region of " << region_size + << " bytes; current memory info is: " << mem_info; + } + void *memory_region; + cudaError_t e; + { + Timer tim; + e = cudaMalloc(&memory_region, region_size); + malloc_time_taken_ += tim.Elapsed(); + } + if (e != cudaSuccess) { + PrintMemoryUsage(); + if (!CuDevice::Instantiate().IsComputeExclusive()) { + KALDI_ERR << "Failed to allocate a memory region of " << region_size + << " bytes. Possibly this is due to sharing the GPU. Try " + << "switching the GPUs to exclusive mode (nvidia-smi -c 3) and using " + << "the option --use-gpu=wait to scripts like " + << "steps/nnet3/chain/train.py. Memory info: " + << mem_info; + } else { + KALDI_ERR << "Failed to allocate a memory region of " << region_size + << " bytes. Possibly smaller minibatch size would help. " + << "Memory info: " << mem_info; + } + } + // this_num_subregions would be approximately 'opts_.num_subregions' if + // 'region_size' was all the device's memory. (We add one to round up). + // We're aiming to get a number of sub-regions approximately equal to + // opts_.num_subregions by the time we allocate all the device's memory. + size_t this_num_subregions = 1 + + (region_size * opts_.num_subregions) / total_memory; + + size_t memory_region_index = memory_regions_.size(); + memory_regions_.resize(memory_region_index + 1); + MemoryRegion &this_region = memory_regions_.back(); + + this_region.begin = static_cast(memory_region); + this_region.end = this_region.begin + region_size; + // subregion_size will be hundreds of megabytes. + size_t subregion_size = region_size / this_num_subregions; + + std::vector new_subregions; + char* subregion_begin = static_cast(memory_region); + for (size_t i = 0; i < this_num_subregions; i++) { + SubRegion *subregion = new SubRegion(); + subregion->memory_region = memory_region_index; + subregion->begin = subregion_begin; + if (i + 1 == this_num_subregions) { + subregion->end = this_region.end; + KALDI_ASSERT(subregion->end > subregion->begin); + } else { + subregion->end = subregion_begin + subregion_size; + subregion_begin = subregion->end; + } + subregion->next = NULL; + if (i > 0) { + new_subregions.back()->next = subregion; + } + new_subregions.push_back(subregion); } + // Initially the memory is in a single block, owned by + // the first subregion. It will be split up gradually. + MemoryBlock *block = new MemoryBlock(); + block->begin = this_region.begin; + block->end = this_region.end; + block->subregion = new_subregions.front(); + block->allocated = false; + block->t = 0; // was never allocated. + block->next = NULL; + block->prev = NULL; + for (size_t i = 0; i < this_num_subregions; i++) + subregions_.push_back(new_subregions[i]); + SortSubregions(); + this_region.block_begin = block; + + AddToFreeBlocks(block); } -bool CuMemoryAllocator::MruCache::Lookup(const MemoryRequest &request, - CachedMemoryElement *output) { - MapType::iterator iter = map_.find(request); - if (iter == map_.end()) - return false; - MapValueType &q = iter->second; - KALDI_ASSERT(!q.empty()); - // use q.back() as we want to return the most recently used one if there - // is a choice. We believe this will give better caching behavior. - *output = q.back().first; - list_.erase(q.back().second); - q.pop_back(); - if (q.empty()) - map_.erase(request); - return true; +// We sort the sub-regions according to the distance between the start of the +// MemoryRegion of which they are a part, and the start of the SubRegion. 
This +// will generally mean that the highest-numbered SubRegion-- the one we keep +// free at all costs-- will be the end of the first block which we allocated +// (which under most situations will be the largest block). +void CuMemoryAllocator::SortSubregions() { + largest_free_block_.resize(subregions_.size()); + + std::vector > pairs; + for (size_t i = 0; i < subregions_.size(); i++) { + SubRegion *subregion = subregions_[i]; + MemoryRegion &memory_region = memory_regions_[subregion->memory_region]; + size_t distance = subregion->begin - memory_region.begin; + pairs.push_back(std::pair(distance, subregion)); + } + std::sort(pairs.begin(), pairs.end()); + for (size_t i = 0; i < subregions_.size(); i++) { + subregions_[i] = pairs[i].second; + subregions_[i]->subregion_index = i; + if (subregions_[i]->free_blocks.empty()) + largest_free_block_[i] = 0; + else + largest_free_block_[i] = subregions_[i]->free_blocks.begin()->first; + } } -void CuMemoryAllocator::MruCache::Insert(const MemoryRequest &request, - const CachedMemoryElement &element) { - list_.push_back(request); - map_[request].push_back(std::pair( - element, - --list_.end())); -} +CuMemoryAllocator g_cuda_allocator; -size_t CuMemoryAllocator::MruCache::RemoveLeastRecentlyUsed() { - // Remove least-recently-used element from cache. - KALDI_ASSERT(!list_.empty()); - MemoryRequest request = list_.front(); - MapType::iterator iter = map_.find(request); - KALDI_ASSERT(iter != map_.end()); - MapValueType &queue = iter->second; - KALDI_ASSERT(!queue.empty()); - // least recently used elements are at the front of the queue. - std::pair &p = queue.front(); - KALDI_ASSERT(p.second == list_.begin()); - CU_SAFE_CALL(cudaFree(p.first.pointer)); - queue.pop_front(); - if (queue.empty()) - map_.erase(request); - list_.pop_front(); - return request.first * request.second; -} -CuMemoryAllocator::MruCache& CuMemoryAllocator::MruCache::operator = ( - const CuMemoryAllocator::MruCache &other) { - KALDI_ASSERT(other.list_.empty()); - return *this; -} -CuMemoryAllocator::MruCache::MruCache( - const CuMemoryAllocator::MruCache &other) { - KALDI_ASSERT(other.list_.empty()); -} +} // namespace kaldi +#endif // HAVE_CUDA -} +namespace kaldi { +// Define/initialize this global variable. It was declared in cu-allocator.h. +// This has to be done outside of the ifdef, because we register the options +// whether or not CUDA is compiled in (so that the binaries accept the same +// options). +CuAllocatorOptions g_allocator_options; -#endif // HAVE_CUDA +} diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 0f96315e848..20425704a2b 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,54 +23,137 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 - #include +#include +#include +#endif + #include +#include #include #include #include +#include #include -#include -#include #include "base/kaldi-common.h" #include "util/stl-utils.h" +#include "itf/options-itf.h" namespace kaldi { // For now we don't give the user a way to modify these from the command line. +// or the code, it just documents what the default options are. To change +// the options, you have to do it in the code. struct CuAllocatorOptions { - // memory_factor is the total amount of (allocated + cached) memory that we - // allow to be held, relative to the max amount of memory the program has ever - // allocated. It will increase the amount of memory the program will - // potentially consume, by this factor. 
- BaseFloat memory_factor; - - // This is the minimum amount of memory that we will delete when we are forced - // to delete stuff, relative to the max amount of memory the program has ever - // allocated. This should be less than memory_factor - 1.0 and > 0. It - // shouldn't be too critical. The reason it exists is to avoid calling the - // cleanup code and only releasing very small amounts of memory, because there - // is a constant overhead proportional to the number of buckets. - BaseFloat delete_factor; - - CuAllocatorOptions(): memory_factor(1.3), - delete_factor(0.001) { } + // True if we are going to actually cache memory allocations on this device. + // You'd normally set it to false only if you wanted to debug a possible + // memory problem using cuda-memcheck or cuda-gdb. It will be slower, but + // using CUDA's native allocator allows those tools to detect out-of-region + // memory accesses. + bool cache_memory; + + // The proportion of the device's memory that the CuAllocator allocates to + // start with; by default this is 0.8, although if you want to share the + // device (not recommended!) you should set this lower. + BaseFloat memory_proportion; + + // The target number of subregions of the entire CUDA device memory (we'll + // start with a smaller number of memory_proportion is << 1). Kind of + // a tuning knob.. more regions will make it more aggressively consolidate + // memory low addresses. + int32 num_subregions; + + CuAllocatorOptions(): + cache_memory(true), memory_proportion(0.5), num_subregions(20) { } + + void Register(OptionsItf *po) { + po->Register("cuda-cache-memory", &cache_memory, "True if you want " + "to use the caching allocator. Set this to false only if you " + "want to use cuda-memcheck or cuda-gdb; it will be slower."); + po->Register("cuda-memory-proportion", &memory_proportion, + "Proportion of the GPU device memory that the allocator " + "should allocate at the start"); + } void Check() { - KALDI_ASSERT(delete_factor < memory_factor - 1.0 && delete_factor > 0.0); + // don't let it get too close to 1; + KALDI_ASSERT(memory_proportion >= 0.05 && memory_proportion < 0.99); } }; +extern CuAllocatorOptions g_allocator_options; + +inline void RegisterCuAllocatorOptions(OptionsItf *po) { + g_allocator_options.Register(po); +} +} // namespace kaldi + + +#if HAVE_CUDA == 1 +namespace kaldi { + +/** + This class allocates large regions of memory from the GPU and allocates + sub-blocks of it for the user. This is needed because the CUDA malloc and + free routines are very slow. + + The user doesn't access this class directly, it is accessed via the CuDevice + object. The CuDevice class allocates memory using this class's Malloc() and + MallocPitch() functions, and frees them with its Free() function, and this + class caches the memory blocks to avoid calling the CUDA library's + malloc/free functions too often. If the application is using multiple + threads, it's necessary to lock this class before using it, and in that case + the CuDevice class calls the MallocLocking() and MallocPitchLocking() + versions of the allocation functions (but the user should call + CuDevice::AllowMultithreading() if the application plans to use GPU + functionality from multiple CPU threads). + + NOTE ON SYNCHRONIZATION: if multiple CUDA streams are used there is a + potential problem with any caching allocator which shares its pool across + CUDA streams. 
That is: if a memory block is freed by stream 1 and allocated to + stream 2, an operation might start in stream 2 before stream 1 has finished + working with that memory location. We solve this here using a rather low-tech + solution, relying on calling SynchronizeGpu() which submits a no-op kernel + into the legacy default stream. Each + time CuMemoryAllocator()::Free() is called and we cache the memory block + in this class, we record the thread-id of the CPU thread from which it was + freed, as well as a timestamp (the t_ member of CuMemoryAllocator, which + we increment every time the class is used). When we allocate memory + that was cached, we try to allocate it from a block that was relased by the + same CPU thread; and if that is not possible and we haven't called + SynchronizeGpu() since the block was freed, then we call + SynchronizeGpu(). The hope is that this will happen quite rarely. + Note that this is based on the assumption that the user is using the + per-thread default stream (indeed this is how we compile). If the + user were to make explicit use of CUDA streams, this mechanism would + not necessarily be sufficient to prevent data-race conditions and the + user might have to take further precautions. + + NOTE ON FRAGMENTATION: Memory fragmentation is one of the main problems that + you'll run into with allocators like this. This allocator will allocate a + small number of large regions of memory, and allocate smaller pieces of + memory that it splits off from the regions as needed. It will always merge + adjacent blocks as much as it can when the user frees memory. The main + heuristic to avoid memory fragmenting too much is that it always allocates, + where possible, from memory that's as close as possible to the start of a + memory region. This will tend to keep all the small allocations together at + the beginning of the memory region, and hopefully keep large blocks availale + at the end. The mechanism to always allocate from as close as possible to + the start of the memory region, is that we split up the memory regions into + a small number of sub-regions and, when handling a request for allocation, + allocate it from the lowest-numbered sub-region that can meet a request for + that size. (Note: we can allocate blocks that span sub-regions, so this + approach does not limit the block size we can allocate). + +*/ -// Class that caches memory for us (the CUDA -// malloc and free routines are very slow). -// This is a member of the CuDevice class. class CuMemoryAllocator { public: - /// Allocates memory on the CUDA device, of size 'size'. + /// Allocates memory on the CUDA device, of size 'size'. size == 0 is not + /// allowed and is an error. void* Malloc(size_t size); /// Allocation function for matrix-like things. @@ -95,156 +178,174 @@ class CuMemoryAllocator { Free(ptr); } + void PrintMemoryUsage() const; - // the maximum amount of memory that was ever allocated in the lifetime of the - // program, in bytes. - size_t MaxMemoryAllocated() const { return max_bytes_allocated_; } - - // memory held in the cache currently, in bytes. - size_t MemoryCached() const { return cur_bytes_allocated_ - cur_bytes_used_; } - - // memory that's cached plus memory that's allocated, in bytes. - size_t MemoryAllocated() const { return cur_bytes_allocated_; } + CuMemoryAllocator(); - void PrintMemoryUsage() const; + // Allows you to set options: must be called before any Malloc function is + // called on this class. 
It's done this way so the options can be changed + // by the user (c.f. RegisterCuAllocatorOptions()) before the options are read. + void SetOptions(const CuAllocatorOptions &opts) { opts_ = opts; } - CuMemoryAllocator(CuAllocatorOptions opts); private: - void FreeSomeCachedMemory(size_t bytes_to_free); + struct SubRegion; + + struct MemoryBlock { + char *begin; // The beginning of the block (in CUDA memory) + char *end; // the end of the block (in CUDA memory) + SubRegion *subregion; // Pointer to the SubRegion to which this memory + // block belongs. + bool allocated; // True if this MemoryBlock has currently been given to the + // user; false if not. + + size_t t; // Zero if this memory block was never given to the user; + // otherwise, the time value (t_ in the CuAllocator class) + // when it was most recently either allocated to the user + // or freed by the user. + + std::thread::id thread_id; // If allocated == false and t > 0 (i.e. this + // memory block was released by the user), the + // thread-id of the user thread that freed this + // block, or the invalid thread-id as created by + // the constructor of std::thread::id if this + // block was created by merging blocks from + // different threads. Required for + // synchronization; and note that we assume + // there is one CUDA stream per CPU thread. + + MemoryBlock *next; // The next MemoryBlock within this MemoryRegion (or + // NULL if this is the last one); its 'begin' would be + // the same as the 'end' of this block. + MemoryBlock *prev; // The previous MemoryBlock within this MemoryRegion (or + // NULL if this is the first one); its 'end' would be the + // same as the 'begin' of this block. - // This calls CudaMallocPitch, checks for errors (dies if it has to), and - // returns the result. It's up to the caller to do all the bookkeeping though. - inline void* MallocPitchInternal(size_t row_bytes, size_t num_rows, size_t *pitch); + }; - typedef std::pair MemoryRequest; // (row_bytes, num_rows). - struct CachedMemoryElement { - void *pointer; // the CUDA memory location that we own - size_t t; // time value when we put this in the cache. - size_t pitch; // pitch of this memory region (c.f. cudaMallocPitch()). - CachedMemoryElement() { } - CachedMemoryElement(void *pointer, size_t t, size_t pitch): - pointer(pointer), t(t), pitch(pitch) { } + // a MemoryRegion is a large piece of memory that we allocated via CudaMalloc. + // there normally won't be more than about 3 or 4 of these. + // We'll identify MemoryRegions by a size_t (e.g 0, 1, 2, 3... ) which is an + // index into the memory_regions_ vector. + struct MemoryRegion { + char *begin; // 'begin' is the start of the memory region. + char *end; // 'end' is the end of the memory region. + SubRegion *subregion_begin; // The first SubRegion that belongs to this + // MemoryRegion. + MemoryBlock *block_begin; // The first MemoryBlock that belongs to this + // MemoryRegion. }; - // This class caches a map from MemoryRequest to a list of CachedMemoryElements, - // and gives us access to the least-recently-used element for efficient. - // removal. - // We will have an instance of this class for each power-of-2 of size in - // bytes. This makes it easier to, when we need to delete something, find - // the item for which the (time-since-used * size-in-bytes) is approximately - // greatest. - class MruCache { - public: - size_t LeastRecentTime() const; // t value of least recent CachedMemoryElement (0 - // if empty). 
- - size_t RemoveLeastRecentlyUsed(); // Remove least-recently-used element - // from cache. Return size in bytes of - // that removed memory region. Crash if - // this was empty. - - // Attempts lookup of the most recently cached element corresponding to - // 'request'. If available, removes it from the cache and puts it to - // 'output', and returns true. Otherwise returns false. - bool Lookup(const MemoryRequest &request, - CachedMemoryElement *output); - - // Inserts this CachedMemoryElement to the list of CachedMemoryElements for this - // MemoryRequest. The time in the CachedMemoryElement is expected to be greater - // than times in previously supplied CachedMemoryElements. - void Insert(const MemoryRequest &request, - const CachedMemoryElement &element); - - struct MemoryRequestHasher { - // input is interpreted as (row_bytes, num_rows). row_bytes will always - // be a multiple of 4, and num_rows will frequently be a multiple of - // powers of 2 also. We need to shift right and add so that there will be - // some action in the lower-order bits. - size_t operator () (const std::pair &p) const noexcept { - size_t temp = p.first + 1867 * p.second; - return temp + (temp >> 2) + (temp >> 8); - } - }; - - MruCache() { } - // Define these to make inclusion in std::vector possible, but make them - // fail if called on anything but empty cache objects-- we never resize - // the vector of caches after initializing it. - MruCache &operator = (const MruCache &other); - MruCache(const MruCache &other); - private: - typedef std::list ListType; - typedef std::list::iterator ListIterType; - typedef std::deque > MapValueType; - typedef unordered_map MapType; - // 'list_' contains MemoryRequests with the most recent on the back (where they are added), - // and least recent on the front (where they are removed by RemoveLeastRecentlyUsed, although - // they are also removed from random parts of the list by Lookup(). - // There will in general be duplicates of MemoryRequests in the list, as - // many as there are entries in the MapValueType. - ListType list_; - // 'map_' maps from a MemoryRequest to a queue of (memory-element, - // iterator), with the most-recently-added things at the back; we remove - // things from the front of these queues (oldest) inside - // RemoveLeastRecentlyUsed(), and from the back (newest) in Lookup. - MapType map_; + // a SubRegion is a smaller zone of memory within a MemoryRegion. For + // example, we divide the first MemoryRegion we allocate into 10 blocks, and + // if we allocate blocks of memory later on, we'll sub-divide them into blocks + // of about the same size. A SubRegion is just a largish bin into which we + // put any blocks of memory that happen to start within that SubRegion; + // actually, memory blocks may cross over the boundaries of SubRegions. The + // motivation for dividing up MemoryRegions into SubRegions is that it allos + // us an efficient mechanism to segregate smaller memory blocks into higher + // memory and larger ones into lower memory: for each allocation, we allocate + // it from the highest-numbered SubRegion that is able to allocate something of + // that size. Over time, this will lead to smaller memory blocks being + // concentrated in higher-numbered SubRegions. + struct SubRegion { + size_t memory_region; // This is an index into the memory_regions_ vector + // which identifies which MemoryRegion this SubRegion + // is a part of. 
+ size_t subregion_index; // The index of this SubRegion within the + // subregions_ vector; this can change when we + // allocate more MemoryRegions. + char *begin; // 'begin' is the start of the memory in this SubRegion. + char *end; // 'end' is the end of the memory in this SubRegion. + + // Contains the free MemoryBlocks starting within this SubRegion. + std::set > free_blocks; + + // Pointer to the next SubRegion within this MemoryRegion (i.e. the SubRegion + // whose begin equals this one's end), or NULL if this is the last one. + SubRegion *next; }; + // Tries to allocate CUDA memory of the given size; will crash if it was not + // able to. + inline void* MallocInternal(size_t size); - inline MruCache &GetCacheForSize(size_t num_bytes); + // Allocates from a given SubRegion, after we have determined that it + // can satisfy this request. Broken out of MallocInternal for clarity. + inline void* MallocFromSubregion(SubRegion *subregion, size_t size); - CuAllocatorOptions opts_; - // indexed by log_2 (amount of memory requested), the caches. - std::vector caches_; + // Splits the given MemoryBlock so that one piece is of size 'size', and + // returns the piece which is of size 'size'. The caller guarantees that + // 'size' is less than the current size of the memory block, that 'block' is + // not currently allocated (i.e. block->allocated == false). This function + // assumes that, at entry, 'block' is not present in its subregion's + // 'free_blocks' (because the caller has removed it), and it takes + // responsibility for entering the 'unused' part (the part we're not + // returning) into its subregion's 'free_blocks' by calling AddToFreeBlocks(). + inline MemoryBlock *SplitBlock(MemoryBlock *block, size_t size); - size_t cur_bytes_allocated_; // number of bytes currently owned by callers or - // cached. - size_t max_bytes_allocated_; // the max over all time, of cur_bytes_allocated_. - size_t cur_bytes_used_; // number of bytes currently owned by callers. - size_t max_bytes_used_; // the max over all time, of cur_bytes_used_. - size_t t_; // time counter, incremented with each call. - size_t num_user_allocations_; // number of times user calls Malloc* - size_t num_system_allocations_; // number of times we call cudaMalloc*. - double tot_time_taken_in_cuda_malloc_; // time in cudaMalloc - double tot_time_taken_in_cuda_malloc_pitch_; // time in cudaMallocPitch - double tot_time_taken_in_cuda_free_; // time in cudaFree - double tot_time_taken_in_malloc_pitch_; // time in this->MallocPitch() - - - // a memory element is 'used' when it is currently possessed by the caller - // (and is not in our cache). - struct UsedMemoryElement { - size_t row_bytes; - size_t num_rows; - size_t pitch; - UsedMemoryElement() { } - UsedMemoryElement(size_t row_bytes, size_t num_rows, size_t pitch): - row_bytes(row_bytes), num_rows(num_rows), pitch(pitch) { } - }; + // Removes this block from the 'free_blocks' set of the SubRegion to which + // it belongs. This is called when allocating a block, and from other places. + void RemoveFromFreeBlocks(MemoryBlock *block); - struct PointerHasher { - size_t operator() (const void *arg) const noexcept { - // the last few bits tend to be very predictable, for alignment reasons (CUDA - // allocation may align on 256 byte or 512 byte boundaries or something similar). - size_t temp = reinterpret_cast(arg); - return (temp >> 4) + (temp >> 9); - } - }; + // Adds this block to the 'free_blocks' set of the SubRegion to which it + // belongs. 
This is called when freeing a block, and from other places. + void AddToFreeBlocks(MemoryBlock *block); - // This is a map from memory locations owned by the user, so we can recover - // the information when people call Free() and we add it back into the cache. - unordered_map used_map_; + // This function is called when an allocation failed and we need to try to + // allocate more memory from the evice. The 'size' is the size of the + // requested memory block whose allocation failed-- it's provided so that + // we can be sure to allocate a new region of at least this size. + void AllocateNewRegion(size_t size); + + // Called from AllocateNewRegion(), this ensures that the subregions are + // sorted as we want (which is a kind of heuristic that will be discussed in + // the code), and it also recomputes the largest_free_block_ array. + void SortSubregions(); - // this is only locked by the '*Locking' versions of the functions. - std::mutex mutex_; + + CuAllocatorOptions opts_; + + std::vector memory_regions_; + + std::vector subregions_; + + // For each SubRegion in sub_regions_, this vector gives us the size of the + // largest free block present in that SubRegion, which is equal to + // sub_regions_[i]->free_blocks.begin()->first. It allows us to fairly + // efficiently find the lowest-numbered SubRegion which can handle a + // particular request for memory. + std::vector largest_free_block_; + + size_t t_; // time counter, incremented with each call. + size_t synchronize_gpu_t_; // value of t_ at the last time we called + // SynchronizeGpu(). + size_t num_synchronizations_; // number of times we called SynchronizeGpu() + double tot_time_taken_; // Total time taken in calls to this object. + double malloc_time_taken_; // Total time we spent calling cudaMalloc(). + + // This is a map from memory locations currently owned by the user, to the + // MemoryBlock which stores the information about that location. + std::unordered_map allocated_block_map_; + + // this is only locked by the '*Locking' versions of the functions (necessary only + // in multi-threaded applications). + std::mutex mutex_; }; -} // namespace +// This function returns some printable information about the memory used +// as a string: an example showing the format is: +// "free: 10M, used: 490M, total: 500M: free/total: 0.02" +// In addition, if the pointers 'free' and 'total' are non-NULL, it will +// output to them the free memory and the total memory of the device. +std::string GetFreeGpuMemory(int64* free, int64* total); + +extern CuMemoryAllocator g_cuda_allocator; + +} // namespace kaldi #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index c5114ed8b22..b8d6e7edbf5 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,7 +23,6 @@ #if HAVE_CUDA == 1 - #include #include #include @@ -42,23 +41,15 @@ #include "base/kaldi-utils.h" #include "util/common-utils.h" #include "util/kaldi-io.h" +// the following is for cuda_legacy_noop(). +#include "cudamatrix/cu-kernels-ansi.h" namespace kaldi { -/** - This function was added by Dan in July 2015 after upgrading on the CLSP - cluster to the CUDA 7.0 toolkit; the old mechanism of just calling - cudaThreadSynchronize() [==cudaDeviceSynchronize()] and having it - automagically select a GPU (when exclusive mode is on) doesn't seem to work - any more, in situations where GPU 0 is already being used. This works. 
It's - not 100% clear if the fact that the old code wasn't working was a bug, or a - changed feature (the NVidia docs were never super-clear regarding device - initialization). But regardless, changing to this new mechanism should be - harmless even if the problem was specific to the CLSP grid. -*/ - +/// This function attempts to get a CUDA device context on some available device +/// by doing 'cudaFree(0)'. If it succeeds it returns true; if it fails, it +/// outputs some debugging information into 'debug_str' and returns false. static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { - // Our first attempt to get a device context is: we do cudaFree(0) and see if // that returns no error code. If it succeeds then we have a device // context. Apparently this is the canonical way to get a context. @@ -88,53 +79,68 @@ static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { return false; } -/** - * SelectGpuId(use_gpu) - * - * There are 3 'use_gpu' modes for GPU selection: - * "yes" -- Select GPU automatically (or get one by exclusive mode) - * and die if this fails. - * "optional" -- Do as above, but if it fails, back off to CPU. - * "no" -- Run on CPU. - * - * In case of Compute exclusive mode, the GPU is selected by OS. - * - * Otherwise GPU selection is based on largest proportion of free memory. - * This can eventually lead to multiple processes computing on single GPU, - * which is slow. More practical is to use "compute exclusive mode". - * - * This method is to be called at the very beginning of the program - * (before first allocation in cudamatrix), or not at all (default to CPU). - * - */ + +void CuDevice::Initialize() { + // This function may be called in the following two situations: + // + // (1) in the main thread, only when a GPU is not currently being used, either + // within a call like CuDevice()::Instantiate().SelectGpuId(..) + // (where the Instantiate() call will call Initialize() before SelectGpuId() + // is called, just because of how Instantiate() works), or in a call + // to 'CuDevice::Instantiate().Enabled()'. In this case it will just + // set initialized_ to true and notice that device_id_ == 1, and do nothing. + // + // (2) in threads created by the user, as soon as someone calls something that + // might potentially use the GPU, via CuDevice()::Instantiate(). + // If device_id_ is >= 0, this will create the cuBLAS and cuSparse handles. + KALDI_ASSERT(!initialized_); + initialized_ = true; + if (device_id_ == -1) { + // There is nothing to do; we are not using a GPU. + return; + } else { + if (!multi_threaded_) { + multi_threaded_ = true; + KALDI_WARN << "For multi-threaded code that might use GPU, you should call " + "CuDevice()::Instantiate().AllowMultithreading() at the start of " + "the program."; + } + device_id_copy_ = device_id_; + cudaSetDevice(device_id_); + // Initialize CUBLAS. + CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); + // Initialize the cuSPARSE library + CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); + + } +} + void CuDevice::SelectGpuId(std::string use_gpu) { - // Possible modes + if (device_id_ != -1) { + KALDI_ERR << "You cannot call SelectGpuId twice if, on the first time, " + "you requested a GPU."; + } if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional" && use_gpu != "wait") { KALDI_ERR << "Please choose : --use-gpu=yes|no|optional|wait, passed '" << use_gpu << "'"; } - - // Make sure this function is not called twice! 
- if (Enabled()) { - KALDI_ERR << "There is already an active GPU " << active_gpu_id_ - << ", cannot change it on the fly!"; - } - // Allow the GPU to stay disabled - if (!Enabled() && use_gpu == "no") { + if (use_gpu == "no") { KALDI_LOG << "Manually selected to compute on CPU."; return; } - // Check that we have a gpu available int32 num_gpus = 0; cudaError_t e = cudaGetDeviceCount(&num_gpus); + // Make sure the global allocator object has the up-to-date options. + g_cuda_allocator.SetOptions(g_allocator_options); + if (num_gpus == 0) { if (use_gpu == "yes" || use_gpu == "wait") { KALDI_CUDA_ERR(e, "No CUDA GPU detected!"); } if (use_gpu == "optional") { - KALDI_WARN << "Running on CPU!!! No CUDA GPU detected..."; + KALDI_WARN << "No CUDA GPU detected; running on CPU since --use-gpu=optional specified."; return; } } @@ -183,8 +189,8 @@ void CuDevice::SelectGpuId(std::string use_gpu) { << " seconds before creating CUDA context"; } - // Re-assure we have the context - KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize()); + // Double check that we have the context + KALDI_ASSERT(cudaSuccess == cudaDeviceSynchronize()); // Check if the machine use compute exclusive mode if (IsComputeExclusive()) { @@ -196,7 +202,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; // We want to choose the device more carefully, so release the CUDA context. - e = cudaThreadExit(); // deprecated, but for legacy reason not cudaDeviceReset + e = cudaDeviceReset(); if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to release CUDA context on a GPU"); } @@ -206,8 +212,8 @@ void CuDevice::SelectGpuId(std::string use_gpu) { FinalizeActiveGpu(); return; } else { - // Could not get GPU, after prevously having the CUDA context? - // Strange but not impossible... + // We could not get a GPU the second time, after prevously having the CUDA + // context. Strange but not impossible. if (use_gpu == "yes") { KALDI_ERR << "Error acquiring GPU."; } @@ -221,37 +227,38 @@ void CuDevice::SelectGpuId(std::string use_gpu) { void CuDevice::FinalizeActiveGpu() { - // The device at this point should have active GPU, so we can query its name - // and memory stats and notify user which GPU is finally used. + // The device at this point should have an active GPU, so we can query its + // name and memory stats and notify user which GPU is being used. - // Get the device-id of active device: + // Get the device-id of the active device. { - int32 act_gpu_id; - cudaError_t e = cudaGetDevice(&act_gpu_id); + int device_id; + cudaError_t e = cudaGetDevice(&device_id); if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get device-id of active device."); } - // Remember the id of active GPU - active_gpu_id_ = act_gpu_id; // CuDevice::Enabled() is true from now on + device_id_ = device_id; + device_id_copy_ = device_id; + initialized_ = true; // Prevent Initialize() from being called on this, + // the main thread. // Initialize CUBLAS. - CUBLAS_SAFE_CALL(cublasCreate(&handle_)); + CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); // Initialize the cuSPARSE library CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); - // Notify user which GPU is finally used + // Notify the user which GPU is being userd. 
char name[128]; - DeviceGetName(name,128,act_gpu_id); + DeviceGetName(name,128, device_id); - CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, act_gpu_id)); + CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, device_id)); - KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t" - << GetFreeMemory(&free_memory_at_startup_, NULL) << " version " + KALDI_LOG << "The active GPU is [" << device_id << "]: " << name << "\t" + << GetFreeGpuMemory(&free_memory_at_startup_, NULL) << " version " << properties_.major << "." << properties_.minor; } return; } - bool CuDevice::DoublePrecisionSupported() { if (!Enabled()) return true; return properties_.major > 1 || (properties_.major == 1 && properties_.minor >= 3); @@ -261,10 +268,10 @@ bool CuDevice::DoublePrecisionSupported() { bool CuDevice::IsComputeExclusive() { // assume we already have an CUDA context created - KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize()); + KALDI_ASSERT(cudaSuccess == cudaDeviceSynchronize()); // get the device-id and its device-properties - int32 gpu_id = -1; + int gpu_id = -1; cudaError_t e = cudaGetDevice(&gpu_id); if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get current device"); @@ -279,11 +286,9 @@ bool CuDevice::IsComputeExclusive() { case cudaComputeModeExclusive : return true; break; -#if (CUDA_VERSION >= 4000) case cudaComputeModeExclusiveProcess : return true; break; -#endif default : // in this case we release the GPU context... return false; @@ -318,37 +323,35 @@ bool CuDevice::SelectGpuIdAuto() { switch(ret) { case cudaSuccess : { // create the CUDA context for the thread - cudaThreadSynchronize(); // deprecated, but for legacy not cudaDeviceSynchronize + cudaDeviceSynchronize(); // get GPU name char name[128]; DeviceGetName(name,128,n); // get GPU memory stats int64 free, total; std::string mem_stats; - mem_stats = GetFreeMemory(&free, &total); + mem_stats = GetFreeGpuMemory(&free, &total); // log KALDI_LOG << "cudaSetDevice(" << n << "): " << name << "\t" << mem_stats; - // We have seen that in some cases GetFreeMemory returns zero + // We have seen that in some cases GetFreeGpuMemory returns zero // That will produce nan after division, which might confuse // the sorting routine. Or maybe not, but let's keep it clean if (total <= 0) { - KALDI_LOG << "Total memory reported for device " << n << " is zero (or less)."; + KALDI_LOG << "Total memory reported for device " << n + << " is zero (or less)."; } float mem_ratio = total > 0 ? 
free/(float)total : 0; free_mem_ratio[n] = std::make_pair(n, mem_ratio); // destroy the CUDA context for the thread - cudaThreadExit(); // deprecated, but for legacy reason not cudaDeviceReset + cudaDeviceReset(); } break; - -#if (CUDA_VERSION > 3020) case cudaErrorDeviceAlreadyInUse : KALDI_LOG << "cudaSetDevice(" << n << "): " << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..."; break; -#endif case cudaErrorInvalidDevice : KALDI_LOG << "cudaSetDevice(" << n << "): " << "Device cannot be accessed, not a VALID CUDA device!"; @@ -366,7 +369,7 @@ bool CuDevice::SelectGpuIdAuto() { // the free_mem_ratio should be bigger than zero KALDI_ASSERT(free_mem_ratio[max_id].second > 0.0); - float dev_id; + int dev_id; float mem_ratio; do { // try to select the GPU in the best to worst order @@ -382,7 +385,7 @@ bool CuDevice::SelectGpuIdAuto() { KALDI_WARN << "Cannot select this device: return code " << e << ", Error message: \"" << cudaGetErrorString(e) << "\""; } else { - e = cudaThreadSynchronize(); // deprecated, but for legacy not cudaDeviceSynchronize + e = cudaDeviceSynchronize(); if (e != cudaSuccess) { KALDI_WARN << "Cannot select this device: return code " << e << ", Error message: \"" << cudaGetErrorString(e) << "\""; @@ -403,10 +406,16 @@ bool CuDevice::SelectGpuIdAuto() { void CuDevice::AccuProfile(const char *function_name, const CuTimer &timer) { if (GetVerboseLevel() >= 1) { + std::unique_lock lock(profile_mutex_, std::defer_lock_t()); + if (multi_threaded_) + lock.lock(); std::string key(function_name); - cudaDeviceSynchronize(); + // by passing 0 as the stream to cudaStreamSynchronize, we are using the + // per-thread default stream. Since we compile with + // -DCUDA_API_PER_THREAD_DEFAULT_STREAM, this equates to a per-thread + // stream. + cudaStreamSynchronize(0); double elapsed = timer.Elapsed(); - if (profile_map_.find(key) == profile_map_.end()) profile_map_[key] = elapsed; else @@ -415,13 +424,8 @@ void CuDevice::AccuProfile(const char *function_name, } void CuDevice::PrintMemoryUsage() const { - if (Enabled()) { - allocator_.PrintMemoryUsage(); - int64 free_memory_now; - GetFreeMemory(&free_memory_now, NULL); - KALDI_LOG << "Memory used (according to the device): " - << (free_memory_at_startup_ - free_memory_now) << " bytes."; - } + if (Enabled()) + g_cuda_allocator.PrintMemoryUsage(); } void CuDevice::PrintProfile() { @@ -452,60 +456,6 @@ void CuDevice::PrintProfile() { } -std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { - // WARNING! the CUDA API is inconsistent accross versions! 
-#ifdef _MSC_VER - size_t mem_free, mem_total; - cuMemGetInfo_v2(&mem_free, &mem_total); -#else -#if (CUDA_VERSION >= 3020) - // define the function signature type - size_t mem_free, mem_total; -#else - unsigned int mem_free, mem_total; -#endif - { - // we will load cuMemGetInfo_v2 dynamically from libcuda.so - // pre-fill ``safe'' values that will not cause problems - mem_free = 1; mem_total = 1; - // open libcuda.so - void* libcuda = dlopen("libcuda.so",RTLD_LAZY); - if (NULL == libcuda) { - KALDI_WARN << "cannot open libcuda.so"; - } else { - // define the function signature type - // and get the symbol -#if (CUDA_VERSION >= 3020) - typedef CUresult (*cu_fun_ptr)(size_t*, size_t*); - cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); -#else - typedef CUresult (*cu_fun_ptr)(int*, int*); - cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); -#endif - if (NULL == dl_cuMemGetInfo) { - KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so"; - } else { - // call the function - dl_cuMemGetInfo(&mem_free, &mem_total); - } - // close the library - dlclose(libcuda); - } - } -#endif - // copy the output values outside - if (NULL != free) *free = mem_free; - if (NULL != total) *total = mem_total; - // prepare the text output - std::ostringstream os; - os << "free:" << mem_free/(1024*1024) << "M, " - << "used:" << (mem_total-mem_free)/(1024*1024) << "M, " - << "total:" << mem_total/(1024*1024) << "M, " - << "free/total:" << mem_free/(float)mem_total; - return os.str(); -} - - void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) { // prefill with something reasonable strncpy(name,"Unknown GPU",len); @@ -554,15 +504,48 @@ void CuDevice::CheckGpuHealth() { AccuProfile(__func__, t); } -CuDevice::CuDevice() : - active_gpu_id_(-1), debug_stride_mode_(false), - num_debug_stride_allocations_(0), allocator_(CuAllocatorOptions()), - multi_threaded_(false) { } +CuDevice::CuDevice(): + initialized_(false), + device_id_copy_(-1), + cublas_handle_(NULL), + cusparse_handle_(NULL) { +} + + +CuDevice::~CuDevice() { + if (cublas_handle_) + CUBLAS_SAFE_CALL(cublasDestroy(cublas_handle_)); + if (cusparse_handle_) + CUSPARSE_SAFE_CALL(cusparseDestroy(cusparse_handle_)); +} + +// Each thread has its own copy of the CuDevice object. +// Note: this was declared "static". +thread_local CuDevice CuDevice::this_thread_device_; -// The instance of the static singleton -CuDevice CuDevice::global_device_; +// define and initialize the static members of the CuDevice object. +int32 CuDevice::device_id_ = -1; +bool CuDevice::multi_threaded_ = false; +unordered_map CuDevice::profile_map_; +std::mutex CuDevice::profile_mutex_; +int64 CuDevice::free_memory_at_startup_; +cudaDeviceProp CuDevice::properties_; +bool CuDevice::debug_stride_mode_ = false; + + +void SynchronizeGpu() { + cuda_legacy_noop(); + CU_SAFE_CALL(cudaGetLastError()); } +} // namespace kaldi + +#else // #if HAVE_CUDA == 1 + +namespace kaldi { +// SynchronizeGpu() does nothing if we didn't compile for GPU. 
+void SynchronizeGpu() { } +} -#endif // HAVE_CUDA +#endif // #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 99105355a8f..4967ccb5045 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -24,7 +24,6 @@ #define KALDI_CUDAMATRIX_CU_DEVICE_H_ #if HAVE_CUDA == 1 - #include #include #include @@ -41,61 +40,95 @@ namespace kaldi { class CuTimer; /** - * Singleton object which represents the CUDA device - * responsible for CUBLAS initilalisation, collects profiling info + This class contains code for selecting the CUDA device, initializing the + cuBLAS and cuSparse handles, and providing an interface for memory allocation + (which supports caching, to avoid the slowness of the CUDA memory allocator). + + There is a separate instance of the CuDevice object for each thread of the + program, but many of its variables are static (hence, shared between all + instances). + + We only (currently) support using a single GPU device; however, we support + multiple CUDA streams. The expected programming model here is that you will + have multiple CPU threads, and each CPU thread automatically gets its own + CUDA stream because we compile with -DCUDA_API_PER_THREAD_DEFAULT_STREAM. + + In terms of synchronizing the activities of multiple threads: The CuDevice + object (with help from the underlying CuAllocator object) ensures that the + memory caching code won't itself be a cause of synchronization problems, + i.e. you don't have to worry that when you allocate with CuDevice::Malloc(), + the memory will still be in use by another thread on the GPU. However, it + may sometimes still be necessary to synchronize the activities of multiple + streams by calling the function SynchronizeGpu()-- probably right before a + thread increments a semaphore, right after it waits on a semaphore, or + right after it acquires a mutex, or something like that. + */ class CuDevice { - // Singleton object (there should only be one instantiated per program) public: - static inline CuDevice& Instantiate() { return global_device_; } - inline cublasHandle_t GetHandle() { return handle_; } + // You obtain the CuDevice for the current thread by calling + // CuDevice::Instantiate() + // At the beginning of the program, if you want to use a GPU, you + // should call CuDevice::Instantiate().SelectGpuId(..). + static inline CuDevice& Instantiate() { + CuDevice &ans = this_thread_device_; + if (!ans.initialized_) + ans.Initialize(); + return ans; + } + + inline cublasHandle_t GetCublasHandle() { return cublas_handle_; } inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; } - // We provide functions Malloc, MallocPitch and Free which replace cudaMalloc, - // cudaMallocPitch and cudaFree. Their function is to cache the results of - // previous allocations to avoid the very large overhead that CUDA's - // allocation seems to give for some setups. + // We provide functions Malloc(), MallocPitch() and Free() which replace + // cudaMalloc(), cudaMallocPitch() and cudaFree(). Their function is to cache + // the results of previous allocations to avoid the very large overhead that + // CUDA's allocation seems to give for some setups. inline void* Malloc(size_t size) { - return multi_threaded_ ? allocator_.MallocLocking(size) : - allocator_.Malloc(size); + return multi_threaded_ ? 
g_cuda_allocator.MallocLocking(size) : + g_cuda_allocator.Malloc(size); } inline void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { if (multi_threaded_) { - return allocator_.MallocPitchLocking(row_bytes, num_rows, pitch); + return g_cuda_allocator.MallocPitchLocking(row_bytes, num_rows, pitch); } else if (debug_stride_mode_) { // The pitch bucket size is hardware dependent. // It is 512 on K40c with CUDA 7.5 // "% 8" ensures that any 8 adjacent allocations have different pitches // if their original pitches are same in the normal mode. - return allocator_.MallocPitch( - row_bytes + 512 * ((num_debug_stride_allocations_++) % 8), num_rows, + return g_cuda_allocator.MallocPitch( + row_bytes + 512 * RandInt(0, 4), num_rows, pitch); } else { - return allocator_.MallocPitch(row_bytes, num_rows, pitch); + return g_cuda_allocator.MallocPitch(row_bytes, num_rows, pitch); } } + inline void Free(void *ptr) { - if (multi_threaded_) allocator_.FreeLocking(ptr); - else allocator_.Free(ptr); + if (multi_threaded_) g_cuda_allocator.FreeLocking(ptr); + else g_cuda_allocator.Free(ptr); } - /// Select a GPU for computation, the 'use_gpu' modes are: - /// "yes" -- Select GPU automatically and die if this fails. + /// Select a GPU for computation. You are supposed to call this function just + /// once, at the beginning of the program (from the main thread), or not at + /// all. + /// The 'use_gpu' modes are: + /// "yes" -- Select GPU automatically and die if this fails. If you have set + /// the GPUs to exclusive mode it will select one + /// pseudo-randomly; otherwise it will choose whichever one has + /// the most free memory (but we recommend to set GPUs to + /// exclusive mode, or controlling which GPU to use by setting + /// the variable CUDA_VISIBLE_DEVICES to the id of the GPU you + /// want the program to use. /// "optional" -- Do as above, but if it fails, back off to CPU. /// "no" -- Run on CPU. - /// (more comments in cu-device.cc) void SelectGpuId(std::string use_gpu); /// Check if the CUDA GPU is selected for use bool Enabled() const { - return (active_gpu_id_ > -1); - } - - /// Get the active GPU id - int32 ActiveGpuId() { - return active_gpu_id_; + return (device_id_ > -1); } /// Returns true if either we have no GPU, or we have a GPU @@ -106,21 +139,19 @@ class CuDevice { /// are printed out when you call PrintProfile(). However, /// it only does something if VerboseLevel() >= 1. void AccuProfile(const char *function_name, const CuTimer &timer); + + /// Print some profiling information using KALDI_LOG. void PrintProfile(); + /// Print some memory-usage information using KALDI_LOG. void PrintMemoryUsage() const; /// The user should call this if the program plans to access the GPU (e.g. via /// using class CuMatrix) from more than one thread. If you fail to call this - /// for a multi-threaded program, it will occasionally segfault. + /// for a multi-threaded program, it may occasionally segfault (and also + /// the code will detect that you failed to call it, and will print a warning). inline void AllowMultithreading() { multi_threaded_ = true; } - void ResetProfile() { - profile_map_.clear(); - } - - /// Get the actual GPU memory use stats - std::string GetFreeMemory(int64* free = NULL, int64* total = NULL) const; /// Get the name of the GPU void DeviceGetName(char* name, int32 len, int32 dev); @@ -153,22 +184,33 @@ class CuDevice { /// (i.e. from outside the class), call this only if Enabled() returns true. 
bool IsComputeExclusive(); - private: CuDevice(); + + ~CuDevice(); + private: CuDevice(CuDevice&); // Disallow. CuDevice &operator=(CuDevice&); // Disallow. - static CuDevice global_device_; - cublasHandle_t handle_; - cusparseHandle_t cusparse_handle_; + /// The Initialize() function exists to do the following, in threads other + /// than the main thread, and only if we are using a GPU: call + /// cudaSetDevice(), and set up cublas_handle_ and cusparse_handle_. It does + /// get called in the main thread (see documentation by its definition), but + /// does nothing interesting there. + void Initialize(); - /// Automatically select GPU and get CUDA context. Returns true on success. + /// Automatically select GPU and get CUDA context (this is only called, from + /// SelectGpuId(), if the GPUs are in non-exclusive mode). Returns true on + /// success. bool SelectGpuIdAuto(); - /// Try to get CUDA context on manually selected GPU. Return true on success. - bool SelectGpuIdManual(int32 gpu_id); - + /// This function, called from SelectGpuId(), is to be called when a + /// GPU context corresponding to the GPU we want to use exists; it + /// works out the device-id, creates the cuBLAS and cuSparse handles, + /// and prints out some information that's useful for debugging. + /// It also sets initialized_ to true, to suppress Initialize() from + /// being called on this, the main thread, in future, since + /// that would try to create the handles again. void FinalizeActiveGpu(); /// Should only be called if Enabled() == true. @@ -177,29 +219,58 @@ class CuDevice { /// Should only be called if Enabled() == true. int32 MinorDeviceVersion(); - unordered_map profile_map_; - /// active_gpu_id_ values: - /// -3 default (default, the SelectGpuId was not called, we did not want to use GPU) - /// -2 SelectGpuId was called, but no GPU was present - /// -1 SelectGpuId was called, but the GPU was manually disabled - /// 0..N Normal GPU IDs - int32 active_gpu_id_; + // Each thread has its own CuDevice object, which contains the cublas and + // cusparse handles. These are unique to the thread (which is what is + // recommended by NVidia). + static thread_local CuDevice this_thread_device_; + + // The GPU device-id that we are using. This will be initialized to -1, and will + // be set when the user calls + // CuDevice::Instantiate::SelectGpuId(...) + // from the main thread. Background threads will, when spawned and when + // CuDevice::Instantiate() is called from them the first time, will + // call cudaSetDevice(device_id)) + static int32 device_id_; - int64 free_memory_at_startup_; + // This will automatically be set to true if the application has multiple + // threads that access the GPU device. It is used to know whether to + // use locks when accessing the allocator and the profiling-related code. + static bool multi_threaded_; - cudaDeviceProp properties_; + // The variable profile_map_ will only be used if the verbose level is >= 1; + // it will accumulate some function-level timing information that is printed + // out at program end. This makes things a bit slower as we have to call + // cudaDeviceSynchronize() to make the timing information meaningful. + static unordered_map profile_map_; + // profile_mutex_ guards profile_map_ in case multi_threaded_ is true. + static std::mutex profile_mutex_; - // there used to be a 'bool verbose_' here. I'm leaving a placeholder here - // instead of removing it because it causes particularly hard-to-debug errors - // if compilation is not done right (e.g. 
make depend was not done), and this - // class's members move about. - bool unused_; - bool debug_stride_mode_; - uint32 num_debug_stride_allocations_; + // free_memory_at_startup_ is just used in printing the memory used according + // to the device. + static int64 free_memory_at_startup_; + static cudaDeviceProp properties_; + + // If set to true by SetDebugStrideMode(), code will be activated to use + // pseudo-random stride values when allocating data (to detect errors which + // otherwise would be rare). + static bool debug_stride_mode_; + + + // The following member variable is initialized to false; if the user calls + // Instantiate() in a thread where it is still false, Initialize() will be + // called, in order to -- if a GPU is being used-- call cudaSetDevice() and + // set up the cublas and cusparse handles. + bool initialized_; + + // This variable is just a copy of the static variable device_id_. It's used + // to detect when this code is called in the wrong way. + int32 device_id_copy_; + + cublasHandle_t cublas_handle_; + + cusparseHandle_t cusparse_handle_; - CuMemoryAllocator allocator_; - bool multi_threaded_; // true if user called AllowMultithreading(). }; // class CuDevice @@ -214,13 +285,38 @@ class CuTimer: public Timer { // This function is declared as a more convenient way to get the CUDA device handle for use // in the CUBLAS v2 API, since we so frequently need to access it. -inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetHandle(); } +inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetCublasHandle(); } // A more convenient way to get the handle to use cuSPARSE APIs. inline cusparseHandle_t GetCusparseHandle() { return CuDevice::Instantiate().GetCusparseHandle(); } -} // namespace + +} // namespace kaldi #endif // HAVE_CUDA -#endif +namespace kaldi { + +/** + The function SynchronizeGpu(), which for convenience is defined whether or + not we have compiled for CUDA, is intended to be called in places where threads + need to be synchronized. + + It just launches a no-op kernel into the legacy default stream. This will + have the effect that it will run after any kernels previously launched from + any stream(*), and before kernels that will later be launched from any stream(*). + (*) does not apply to non-blocking streams. + + Note: at the time of writing we never call SynchronizeGpu() from binary-level + code because it hasn't become necessary yet; the only program that might have + multiple threads actually using the GPU is rnnlm-train (if the user were to + invoke it with the ,bg option for loading training examples); but the only + CUDA invocation the RnnlmExample::Read() function uses (via + CuMatrix::Read()), is cudaMemcpy, which is synchronous already. + +*/ +void SynchronizeGpu(); + +} // namespace kaldi + +#endif // KALDI_CUDAMATRIX_CU_DEVICE_H_ diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index ebbcb9da5ff..a61bb601e8e 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -790,6 +790,10 @@ void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, MatrixDim dim, const uint8_t *src, int src_stride, float scale); +// Launches a kernel that does nothing, explicitly using the legacy default stream; +// this will synchronize all CUDA streams (except for non-blocking streams) on the +// device. 
+void cuda_legacy_noop(); } // extern "C" diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 4101d5ba52f..5a5307b9f87 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3699,7 +3699,9 @@ static void _cuda_uncompress(BaseFloat *dest, MatrixDim dim, } } - +__global__ +static void _noop_kernel() { +} /*********************************************************************** * ANSI-C wrappers of CUDA kernels @@ -5459,3 +5461,10 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, int src_stride, float scale) { _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); } + + +// Launches a kernel that does nothing, explicitly using the legacy default stream; +// this will synchronize all threads without blocking. +void cuda_legacy_noop() { + _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>(); +} diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 01030bb8353..ba91f65e484 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2620,7 +2620,11 @@ static int32 DoubleFactorial(int32 i) { template static void UnitTestCuMatrixSetRandn() { - { // First test consistency when called twice. + + if (false) { + // This block tests consistency when called twice. + // It has been disabled since we added multi-threaded testing, + // since consistency wouldn't be expected if other threads were running. int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; Matrix M(dimM, dimN), N(dimM, dimN); srand(104); @@ -3040,16 +3044,38 @@ template void CudaMatrixUnitTest() { int main() { SetVerboseLevel(1); int32 loop = 0; + bool test_threads = true; + // num_threads only matters if test_threads == true. Don't make it + // to large, because it will affect CPU usage if you are using CPU. + int32 num_threads = 4; + + #if HAVE_CUDA == 1 for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); + if (test_threads) + CuDevice::Instantiate().AllowMultithreading(); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); else CuDevice::Instantiate().SelectGpuId("yes"); #endif - kaldi::CudaMatrixUnitTest(); + if (test_threads) { + KALDI_LOG << "Doing matrix unit test with " + << num_threads << " threads."; + std::vector threads; + for (int32 i = 0; i < num_threads - 1; i++) + threads.push_back(new std::thread(kaldi::CudaMatrixUnitTest)); + // the last thread running is the main thread. 
+ kaldi::CudaMatrixUnitTest(); + for (size_t i = 0; i < threads.size(); i++) { + threads[i]->join(); + delete threads[i]; + } + } else { + kaldi::CudaMatrixUnitTest(); + } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile index 39e4ae39bcc..644eb639381 100644 --- a/src/fstbin/Makefile +++ b/src/fstbin/Makefile @@ -15,7 +15,7 @@ BINFILES = fstdeterminizestar \ fstmakecontextsyms fstaddsubsequentialloop fstaddselfloops \ fstrmepslocal fstcomposecontext fsttablecompose fstrand \ fstdeterminizelog fstphicompose fstcopy \ - fstpushspecial fsts-to-transcripts fsts-project fsts-union + fstpushspecial fsts-to-transcripts fsts-project fsts-union fsts-concat OBJFILES = diff --git a/src/fstbin/fsts-concat.cc b/src/fstbin/fsts-concat.cc new file mode 100644 index 00000000000..2a217eda7dc --- /dev/null +++ b/src/fstbin/fsts-concat.cc @@ -0,0 +1,112 @@ +// fstbin/fsts-concat.cc + +// Copyright 2016 Johns Hopkins University (Authors: Jan "Yenda" Trmal) +// 2018 Soapbox Labs (Author: Karel Vesely) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-utils.h" +#include "fstext/kaldi-fst-io.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace fst; + typedef kaldi::int32 int32; + typedef kaldi::uint64 uint64; + + const char *usage = + "Reads kaldi archives with FSTs. Concatenates the fsts from all the rspecifiers.\n" + "The fsts to concatenate must have same key. The sequencing is given by the position of arguments.\n" + "\n" + "Usage: fsts-concat [options] ... \n" + " e.g.: fsts-concat scp:fsts1.scp scp:fsts2.scp ... ark:fsts_out.ark\n" + "\n" + "see also: fstconcat (from the OpenFst toolkit)\n"; + + ParseOptions po(usage); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string fsts_rspecifier = po.GetArg(1), + fsts_wspecifier = po.GetArg(po.NumArgs()); + + SequentialTableReader fst_reader(fsts_rspecifier); + std::vector*> fst_readers; + TableWriter fst_writer(fsts_wspecifier); + + for (int32 i = 2; i < po.NumArgs(); i++) + fst_readers.push_back(new RandomAccessTableReader(po.GetArg(i))); + const int32 num_fst_readers = fst_readers.size(); + + int32 n_done = 0, + n_skipped = 0; + + for (; !fst_reader.Done(); fst_reader.Next()) { + std::string key = fst_reader.Key(); + + // Check that the key exists in all 'fst_readers'. + bool skip_key = false; + for (int32 i = 0; i < num_fst_readers; i++) { + if (!fst_readers[i]->HasKey(key)) { + KALDI_WARN << "Skipping '" << key << "'" + << " due to missing the fst in " << (i+2) << "th : " + << "'" << po.GetArg(i+2) << "'"; + skip_key = true; + } + } + if (skip_key) { + n_skipped++; + continue; + } + + // Concatenate! 
+ VectorFst fst_out = fst_readers.back()->Value(key); + // Loop from (last-1) to first, as 'prepending' the fsts is faster, + // see: http://www.openfst.org/twiki/bin/view/FST/ConcatDoc + for (int32 i = num_fst_readers-2; i >= 0; i--) { + fst::Concat(fst_readers[i]->Value(key), &fst_out); + } + // Finally, prepend the fst from the 'Sequential' reader. + fst::Concat(fst_reader.Value(), &fst_out); + + // Write the output. + fst_writer.Write(key, fst_out); + n_done++; + } + + // Cleanup. + for (int32 i = 0; i < num_fst_readers; i++) + delete fst_readers[i]; + fst_readers.clear(); + + KALDI_LOG << "Produced " << n_done << " FSTs by concatenating " << po.NumArgs()-1 + << " streams " << "(" << n_skipped << " keys skipped)."; + return (n_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/ivectorbin/ivector-compute-plda.cc b/src/ivectorbin/ivector-compute-plda.cc index 2e874adcca6..c955f07bd68 100644 --- a/src/ivectorbin/ivector-compute-plda.cc +++ b/src/ivectorbin/ivector-compute-plda.cc @@ -85,6 +85,7 @@ int main(int argc, char *argv[]) { num_utt_done++; } } + if (ivectors.size() == 0) { KALDI_WARN << "Not producing output for speaker " << spk << " since no utterances had iVectors"; @@ -101,6 +102,10 @@ int main(int argc, char *argv[]) { } } + if (num_utt_done <= plda_stats.Dim()) + KALDI_ERR << "Number of training iVectors is not greater than their " + << "dimension, unable to estimate PLDA."; + KALDI_LOG << "Accumulated stats from " << num_spk_done << " speakers (" << num_spk_err << " with no utterances), consisting of " << num_utt_done << " utterances (" << num_utt_err diff --git a/src/ivectorbin/ivector-plda-scoring-dense.cc b/src/ivectorbin/ivector-plda-scoring-dense.cc index 076fd41ad09..73ca879e6bc 100644 --- a/src/ivectorbin/ivector-plda-scoring-dense.cc +++ b/src/ivectorbin/ivector-plda-scoring-dense.cc @@ -194,8 +194,8 @@ int main(int argc, char *argv[]) { } else { KALDI_WARN << "Unable to compute conversation dependent PCA for" << " recording " << reco << "."; - ivector_mat_pca.Resize(ivector_mat.NumRows(), ivector_mat.NumCols()); - ivector_mat_pca.CopyFromMat(ivector_mat); + TransformIvectors(ivector_mat, plda_config, this_plda, + &ivector_mat_plda); } for (int32 i = 0; i < ivector_mat_plda.NumRows(); i++) { for (int32 j = 0; j < ivector_mat_plda.NumRows(); j++) { diff --git a/src/makefiles/cuda_32bit.mk b/src/makefiles/cuda_32bit.mk index 9a343d1ae24..f6ddfb6d80f 100644 --- a/src/makefiles/cuda_32bit.mk +++ b/src/makefiles/cuda_32bit.mk @@ -7,7 +7,8 @@ endif CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 32 -DHAVE_CUDA \ - -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) + -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DCUDA_API_PER_THREAD_DEFAULT_STREAM CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include LDFLAGS += -L$(CUDATKDIR)/lib -Wl,-rpath=$(CUDATKDIR)/lib LDLIBS += -lcublas -lcusparse -lcudart -lcurand #LDLIBS : The libs are loaded later than static libs in implicit rule diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index be76798b1d3..6a428e7391f 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -7,7 +7,8 @@ endif CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA \ - -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) + -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DCUDA_API_PER_THREAD_DEFAULT_STREAM CXXFLAGS += 
-DHAVE_CUDA -I$(CUDATKDIR)/include CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand #LDLIBS : The libs are loaded later than static libs in implicit rule diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index b5740053f46..0677e1ca474 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -28,7 +28,7 @@ OnlineNaturalGradient::OnlineNaturalGradient(): rank_(40), update_period_(1), num_samples_history_(2000.0), num_minibatches_history_(0.0), alpha_(4.0), epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0), - self_debug_(false) { } + self_debug_(false), rho_t_(-1.0e+10) { } /** @@ -623,6 +623,21 @@ void OnlineNaturalGradient::SetAlpha(BaseFloat alpha) { alpha_ = alpha; } - -} +void OnlineNaturalGradient::Swap(OnlineNaturalGradient *other) { + std::swap(rank_, other->rank_); + std::swap(update_period_, other->update_period_); + std::swap(num_samples_history_, other->num_samples_history_); + std::swap(num_minibatches_history_, other->num_minibatches_history_); + std::swap(alpha_, other->alpha_); + std::swap(epsilon_, other->epsilon_); + std::swap(delta_, other->delta_); + std::swap(frozen_, other->frozen_); + std::swap(t_, other->t_); + std::swap(self_debug_, other->self_debug_); + W_t_.Swap(&(other->W_t_)); + std::swap(rho_t_, other->rho_t_); + d_t_.Swap(&(other->d_t_)); } + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index b49769da540..a68ad9bbb53 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -466,6 +466,9 @@ class OnlineNaturalGradient { explicit OnlineNaturalGradient(const OnlineNaturalGradient &other); // Assignent operator OnlineNaturalGradient &operator = (const OnlineNaturalGradient &other); + + // Shallow swap + void Swap(OnlineNaturalGradient *other); private: diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 2ec2699ec97..87eacf75327 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -88,7 +88,10 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { } else { // conventional training TrainInternal(chain_eg, *computation); } - + if (num_minibatches_processed_ == 0) { + ConsolidateMemory(nnet_); + ConsolidateMemory(delta_nnet_); + } num_minibatches_processed_++; } diff --git a/src/nnet3/nnet-common.cc b/src/nnet3/nnet-common.cc index 31ff9819dfa..e4bd1a402eb 100644 --- a/src/nnet3/nnet-common.cc +++ b/src/nnet3/nnet-common.cc @@ -392,11 +392,12 @@ size_t IndexVectorHasher::operator () ( // skipping over more elements. Setting n1 large or // n2 to 1 would make the hasher consider all // elements. + size_t len = index_vector.size(); // all long-ish numbers appearing below are randomly chosen primes. - size_t ans = 1433 + 34949 * index_vector.size(); + size_t ans = 1433 + 34949 * len; std::vector::const_iterator iter = index_vector.begin(), end = index_vector.end(), med = end; - if (med > iter + n1) + if (n1 < len) med = iter + n1; for (; iter != med; ++iter) { @@ -412,6 +413,10 @@ size_t IndexVectorHasher::operator () ( ans += iter->n * 1619; ans += iter->t * 15649; ans += iter->x * 89809; + // The following if-statement was introduced in order to fix an + // out-of-range iterator problem on Windows. 
+ if (n2 > len || iter >= end - n2) + break; } return ans; } diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 2c76805f5cc..d2d325d22f1 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -367,8 +367,10 @@ void NonlinearComponent::StoreStatsInternal( void NonlinearComponent::StoreBackpropStats( const CuMatrixBase &out_deriv) { - // only store these stats about every 4 minibatches. - if (RandInt(0, 3) == 0) + // Only store these stats about every 4 minibatches. Make sure to always + // store the stats on the very first minibatch, or it would interact badly + // with the ConsolidateMemory() code. + if (RandInt(0, 3) == 0 && oderiv_count_ != 0) return; KALDI_ASSERT(out_deriv.NumCols() == dim_); @@ -622,7 +624,11 @@ void NonlinearComponent::InitFromConfig(ConfigLine *cfl) { << Type() << ": \"" << cfl->WholeLine() << "\""; } - +void NonlinearComponent::ConsolidateMemory() { + { CuVector temp(value_sum_); value_sum_.Swap(&temp); } + { CuVector temp(deriv_sum_); deriv_sum_.Swap(&temp); } + { CuVector temp(oderiv_sumsq_); oderiv_sumsq_.Swap(&temp); } +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 01697353308..32d6b3d305d 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -375,6 +375,23 @@ class Component { /// backprop to consume it. virtual void DeleteMemo(void *memo) const { KALDI_ASSERT(memo == NULL); } + /// This virtual function relates to memory management, and avoiding + /// fragmentation. It is called only once per model, after we do the first + /// minibatch of training. The default implementation does nothing, but it + /// can be overridden by child classes, where it may re-initialize certain + /// quantities that may possibly have been allocated during the forward pass + /// (e.g. certain statistics; OnlineNaturalGradient objects). We use our own + /// CPU-based allocator (see cu-allocator.h) and since it can't do paging + /// since we're not in control of the GPU page table, fragmentation can be a + /// problem. The allocator always tries to put things in 'low-address memory' + /// (i.e. at smaller memory addresses) near the beginning of the block it + /// allocated, to avoid fragmentation; but if permanent things (belonging to + /// the model) are allocated in the forward pass, they can permanently stay in + /// high memory. This function helps to prevent that, by re-allocating those + /// things into low-address memory (It's important that it's called after all the + /// temporary buffers for the forward-backward have been freed, so that there + /// is low-address memory available)). + virtual void ConsolidateMemory() { } Component() { } @@ -620,6 +637,8 @@ class NonlinearComponent: public Component { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); + virtual void ConsolidateMemory(); + // The following functions are unique to NonlinearComponent. // They mostly relate to diagnostics. 
const CuVector &ValueSum() const { return value_sum_; } diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index f48a3968c88..7a1617f261a 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -665,6 +665,12 @@ void TimeHeightConvolutionComponent::PrecomputedIndexes::Read( ExpectToken(is, binary, ""); } +void TimeHeightConvolutionComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_in(preconditioner_in_); + preconditioner_in_.Swap(&temp_in); + OnlineNaturalGradient temp_out(preconditioner_out_); + preconditioner_out_.Swap(&temp_out); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index e107962abc2..279cec321dd 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -300,6 +300,8 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { }; void ScaleLinearParams(BaseFloat alpha) { linear_params_.Scale(alpha); } + + void ConsolidateMemory(); private: void Check() const; @@ -556,6 +558,8 @@ class TdnnComponent: public UpdatableComponent { CuVector &BiasParams() { return bias_params_; } BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; } + + void ConsolidateMemory(); private: // This static function is a utility function that extracts a CuSubMatrix diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 00a31fa897c..48cc0112368 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1385,7 +1385,10 @@ void ConstantComponent::UnVectorize(const VectorBase ¶ms) { output_.CopyFromVec(params); } - +void ConstantComponent::ConsolidateMemory() { + OnlineNaturalGradient temp(preconditioner_); + preconditioner_.Swap(&temp); +} std::string DropoutMaskComponent::Info() const { std::ostringstream stream; diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index cff73a55b59..d2def5d6e7e 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -679,6 +679,8 @@ class ConstantComponent: public UpdatableComponent { virtual int32 NumParameters() const; virtual void Vectorize(VectorBase *params) const; virtual void UnVectorize(const VectorBase ¶ms); + + virtual void ConsolidateMemory(); private: // the output value-- a vector. diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 69f8442a08a..7a5eb7017a3 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -417,8 +417,10 @@ void SigmoidComponent::RepairGradients( void SigmoidComponent::StoreStats(const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { - // only store stats about every other minibatch. - if (RandInt(0, 1) == 0) + // Only store stats about every other minibatch (but on the first minibatch, + // always store it, which is necessary for the ConsolidateMemory() operation + // to work correctly. + if (RandInt(0, 1) == 0 && count_ != 0) return; // derivative of the nonlinearity is out_value * (1.0 - out_value); CuMatrix temp_deriv(out_value.NumRows(), out_value.NumCols(), @@ -939,8 +941,10 @@ void TanhComponent::Backprop(const std::string &debug_info, void TanhComponent::StoreStats(const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { - // only store stats about every other minibatch. 
- if (RandInt(0, 1) == 0) + // Only store stats about every other minibatch (but on the first minibatch, + // always store it, which is necessary for the ConsolidateMemory() operation + // to work correctly. + if (RandInt(0, 1) == 0 && count_ != 0) return; // derivative of the onlinearity is out_value * (1.0 - out_value); CuMatrix temp_deriv(out_value); @@ -1073,8 +1077,10 @@ void RectifiedLinearComponent::StoreStats( const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { - // only store stats about every other minibatch. - if (RandInt(0, 1) == 0) + // Only store stats about every other minibatch (but on the first minibatch, + // always store it, which is necessary for the ConsolidateMemory() operation + // to work correctly. + if (RandInt(0, 1) == 0 && count_ != 0) return; CuMatrix temp_deriv(out_value.NumRows(), out_value.NumCols(), @@ -1637,6 +1643,12 @@ void NaturalGradientRepeatedAffineComponent::Update( bias_params_.AddVec(learning_rate_ * scale, bias_deriv); } +void NaturalGradientRepeatedAffineComponent::ConsolidateMemory() { + OnlineNaturalGradient temp(preconditioner_in_); + preconditioner_in_.Swap(&temp); +} + + BlockAffineComponent::BlockAffineComponent(const BlockAffineComponent &other) : UpdatableComponent(other), linear_params_(other.linear_params_), @@ -2555,6 +2567,13 @@ void ScaleAndOffsetComponent::BackpropInternal( } } +void ScaleAndOffsetComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_scale(scale_preconditioner_); + scale_preconditioner_.Swap(&temp_scale); + OnlineNaturalGradient temp_offset(offset_preconditioner_); + offset_preconditioner_.Swap(&temp_offset); +} + std::string ConstantFunctionComponent::Info() const { std::ostringstream stream; @@ -2744,7 +2763,10 @@ void ConstantFunctionComponent::UnVectorize(const VectorBase ¶ms) output_.CopyFromVec(params); } - +void ConstantFunctionComponent::ConsolidateMemory() { + OnlineNaturalGradient temp(preconditioner_); + preconditioner_.Swap(&temp); +} void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate @@ -3017,12 +3039,17 @@ void NaturalGradientAffineComponent::Add(BaseFloat alpha, const Component &other bias_params_.AddVec(alpha, other->bias_params_); } -/// virtual void NaturalGradientAffineComponent::FreezeNaturalGradient(bool freeze) { preconditioner_in_.Freeze(freeze); preconditioner_out_.Freeze(freeze); } +void NaturalGradientAffineComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_in(preconditioner_in_); + preconditioner_in_.Swap(&temp_in); + OnlineNaturalGradient temp_out(preconditioner_out_); + preconditioner_out_.Swap(&temp_out); +} void LinearComponent::Read(std::istream &is, bool binary) { std::string token = ReadUpdatableCommon(is, binary); @@ -3291,6 +3318,12 @@ void LinearComponent::FreezeNaturalGradient(bool freeze) { preconditioner_out_.Freeze(freeze); } +void LinearComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_in(preconditioner_in_); + preconditioner_in_.Swap(&temp_in); + OnlineNaturalGradient temp_out(preconditioner_out_); + preconditioner_out_.Swap(&temp_out); +} std::string FixedAffineComponent::Info() const { std::ostringstream stream; @@ -3900,11 +3933,15 @@ void NaturalGradientPerElementScaleComponent::Update( scales_.AddVec(1.0, delta_scales); } -/// virtual void NaturalGradientPerElementScaleComponent::FreezeNaturalGradient(bool freeze) { preconditioner_.Freeze(freeze); } +void NaturalGradientPerElementScaleComponent::ConsolidateMemory() { 
+ OnlineNaturalGradient temp(preconditioner_); + preconditioner_.Swap(&temp); +} + // Constructors for the convolution component ConvolutionComponent::ConvolutionComponent(): UpdatableComponent(), @@ -5874,6 +5911,11 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { } } +void LstmNonlinearityComponent::ConsolidateMemory() { + OnlineNaturalGradient preconditioner_temp(preconditioner_); + preconditioner_.Swap(&preconditioner_temp); +} + SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): input_dim_(other.input_dim_), output_dim_(other.output_dim_), scale_(other.scale_) { } diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 12ae99d716b..11c60f8f352 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -650,6 +650,9 @@ class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent { // Copy constructor explicit NaturalGradientRepeatedAffineComponent( const NaturalGradientRepeatedAffineComponent &other); + + virtual void ConsolidateMemory(); + private: virtual void Update( const CuMatrixBase &in_value, @@ -832,6 +835,9 @@ class NaturalGradientAffineComponent: public AffineComponent { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); virtual void FreezeNaturalGradient(bool freeze); + + virtual void ConsolidateMemory(); + // copy constructor explicit NaturalGradientAffineComponent( const NaturalGradientAffineComponent &other); @@ -955,6 +961,8 @@ class LinearComponent: public UpdatableComponent { virtual void Vectorize(VectorBase *params) const; virtual void UnVectorize(const VectorBase &params); virtual void FreezeNaturalGradient(bool freeze); + virtual void ConsolidateMemory(); + // copy constructor explicit LinearComponent(const LinearComponent &other); @@ -1715,6 +1723,7 @@ class ConstantFunctionComponent: public UpdatableComponent { virtual int32 NumParameters() const; virtual void Vectorize(VectorBase *params) const; virtual void UnVectorize(const VectorBase &params); + virtual void ConsolidateMemory(); private: int32 input_dim_; // the output value-- a vector.
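The ConsolidateMemory() overrides added above all rely on the same copy-and-swap idiom: copy-constructing a temporary forces the object's GPU-resident buffers to be re-allocated (which, once the temporary forward/backward buffers of the first minibatch have been freed, the caching allocator will place in low-address memory), and Swap() then hands that fresh storage to the member, so the old high-address storage is released when the temporary goes out of scope. A minimal sketch of the idiom follows; ExampleComponent is hypothetical, not part of the patch, and assumes the kaldi cudamatrix and nnet3 headers are available.

// Hypothetical component, for illustration only: shows the copy-and-swap
// re-allocation idiom used by the ConsolidateMemory() overrides in this patch.
class ExampleComponent {
 public:
  void ConsolidateMemory() {
    {
      // Copy-construct a temporary (this allocates fresh GPU memory), then
      // swap it into the member; the old storage is freed when 'temp' dies.
      OnlineNaturalGradient temp(preconditioner_);
      preconditioner_.Swap(&temp);
    }
    {
      CuVector<double> temp(value_sum_);
      value_sum_.Swap(&temp);
    }
  }
 private:
  OnlineNaturalGradient preconditioner_;  // natural-gradient state.
  CuVector<double> value_sum_;            // accumulated activation statistics.
};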
@@ -1783,6 +1792,8 @@ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { int32 rank, int32 update_period, BaseFloat num_samples_history, BaseFloat alpha); + void ConsolidateMemory(); + private: // unlike the NaturalGradientAffineComponent, there is only one dimension to // consider as the parameters are a vector not a matrix, so we only need one @@ -1888,6 +1899,7 @@ class ScaleAndOffsetComponent: public UpdatableComponent { virtual int32 NumParameters() const { return 2 * scales_.Dim(); } virtual void Vectorize(VectorBase *params) const; virtual void UnVectorize(const VectorBase ¶ms); + virtual void ConsolidateMemory(); // copy constructor @@ -2281,6 +2293,8 @@ class LstmNonlinearityComponent: public UpdatableComponent { BaseFloat sigmoid_self_repair_threshold, BaseFloat self_repair_scale); + virtual void ConsolidateMemory(); + private: // Initializes the natural-gradient object with the configuration we diff --git a/src/nnet3/nnet-tdnn-component.cc b/src/nnet3/nnet-tdnn-component.cc index 52ad1031a4c..c287ce303a6 100644 --- a/src/nnet3/nnet-tdnn-component.cc +++ b/src/nnet3/nnet-tdnn-component.cc @@ -694,6 +694,12 @@ void TdnnComponent::PrecomputedIndexes::Read( ExpectToken(is, binary, ""); } +void TdnnComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_in(preconditioner_in_); + preconditioner_in_.Swap(&temp_in); + OnlineNaturalGradient temp_out(preconditioner_out_); + preconditioner_out_.Swap(&temp_out); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 8fda24cd22d..0acaa5c2008 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -82,8 +82,12 @@ void NnetTrainer::Train(const NnetExample &eg) { } else { // conventional training TrainInternal(eg, *computation); } - + if (num_minibatches_processed_ == 0) { + ConsolidateMemory(nnet_); + ConsolidateMemory(delta_nnet_); + } num_minibatches_processed_++; + } void NnetTrainer::TrainInternal(const NnetExample &eg, diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index d16a728e2ab..e020f8fc6a7 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -1058,6 +1058,27 @@ void ConstrainOrthonormal(Nnet *nnet) { } } +void ConsolidateMemory(Nnet *nnet) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + bool print_memory_info = (GetVerboseLevel() >= 1); + if (print_memory_info) { + KALDI_VLOG(1) << "Consolidating memory; will print memory usage before " + "and after consolidating:"; + g_cuda_allocator.PrintMemoryUsage(); + } + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *comp = nnet->GetComponent(c); + comp->ConsolidateMemory(); + } + if (print_memory_info) { + g_cuda_allocator.PrintMemoryUsage(); + } + } +#endif +} + + // This code has been broken out of ReadEditConfig as it's quite long. // It implements the internals of the edit directive 'reduce-rank'. @@ -2065,7 +2086,7 @@ bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, ostr << "Per-component max-change active on " << num_max_change_per_component_applied_per_minibatch << " / " << num_updatable << " Updatable Components." - << "(smallest factor=" << min_scale << " on " + << " (Smallest factor=" << min_scale << " on " << component_name_with_min_scale << " with max-change=" << max_change_with_min_scale <<"). 
"; if (param_delta > max_param_change * max_change_scale) diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index c54fcf87e64..787bd228a38 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -476,6 +476,19 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, */ void ConstrainOrthonormal(Nnet *nnet); + +/** + This just calls ConsolidateMemory() on all the components of the nnet. This + is called by the training code after processing the first minibatch. On some + components this will do nothing; on some components it will reallocate + certain quantities that have been allocated during training (mostly the + contents of NaturalGradientOnline objects, and stats for NonlinearComponents) + so that they can be put into low memory. This will tend to minimize + memory fragmentation. Read comments in ../cudamatrix/cu-allocator.h for + more explanation. + */ +void ConsolidateMemory(Nnet *nnet); + /** This utility function can be used to obtain the number of distinct 'n' values in a training example. This is the number of examples (e.g. sequences) that have been combined into a single example. (Actually diff --git a/src/nnet3bin/nnet3-train.cc b/src/nnet3bin/nnet3-train.cc index 271af5d06dc..d3fbaa587e1 100644 --- a/src/nnet3bin/nnet3-train.cc +++ b/src/nnet3bin/nnet3-train.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "nnet3/nnet-training.h" - +#include "cudamatrix/cu-allocator.h" int main(int argc, char *argv[]) { try { @@ -53,6 +53,7 @@ int main(int argc, char *argv[]) { "yes|no|optional|wait, only has effect if compiled with CUDA"); train_config.Register(&po); + RegisterCuAllocatorOptions(&po); po.Read(argc, argv); @@ -94,5 +95,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/rnnlm/rnnlm-core-training.cc b/src/rnnlm/rnnlm-core-training.cc index 63a6dee188d..5a1ae97895f 100644 --- a/src/rnnlm/rnnlm-core-training.cc +++ b/src/rnnlm/rnnlm-core-training.cc @@ -343,6 +343,11 @@ void RnnlmCoreTrainer::ProcessOutput( computer->AcceptInput("output", &output_deriv); } +void RnnlmCoreTrainer::ConsolidateMemory() { + kaldi::nnet3::ConsolidateMemory(nnet_); + kaldi::nnet3::ConsolidateMemory(delta_nnet_); +} + RnnlmCoreTrainer::~RnnlmCoreTrainer() { PrintMaxChangeStats(); // Note: the objective-function stats are printed out in the destructor of the diff --git a/src/rnnlm/rnnlm-core-training.h b/src/rnnlm/rnnlm-core-training.h index 8f5ce873ff1..dd5fcfebd95 100644 --- a/src/rnnlm/rnnlm-core-training.h +++ b/src/rnnlm/rnnlm-core-training.h @@ -189,6 +189,10 @@ class RnnlmCoreTrainer { // per-component max-change and global max-change were enforced. void PrintMaxChangeStats() const; + + // Calls ConsolidateMemory() on nnet_ and delta_nnet_. 
+ void ConsolidateMemory(); + ~RnnlmCoreTrainer(); private: diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc index 4c42bd4ab39..c4238c7356a 100644 --- a/src/rnnlm/rnnlm-embedding-training.cc +++ b/src/rnnlm/rnnlm-embedding-training.cc @@ -175,7 +175,7 @@ void RnnlmEmbeddingTrainer::Train( if (config_.l2_regularize > 0.0) { BaseFloat l2_term = -2 * config_.l2_regularize; if (l2_term != 0.0) { - embedding_deriv->AddToRows(l2_term, active_words, embedding_mat_); + embedding_deriv->AddRows(l2_term, *embedding_mat_, active_words); } } BaseFloat scale = 1.0; @@ -229,8 +229,8 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( if (config_.l2_regularize > 0.0 && !is_backstitch_step1) { BaseFloat l2_term = -2 * config_.l2_regularize; if (l2_term != 0.0) { - embedding_deriv->AddMat(1.0 / (1.0 + config_.backstitch_training_scale) * - l2_term, *embedding_mat_); + embedding_deriv->AddRows(l2_term / (1.0 + config_.backstitch_training_scale), + *embedding_mat_, active_words); } } BaseFloat scale = 1.0; diff --git a/src/rnnlm/rnnlm-example-test.cc b/src/rnnlm/rnnlm-example-test.cc index 8b393acf4ff..ccfdd90bbea 100644 --- a/src/rnnlm/rnnlm-example-test.cc +++ b/src/rnnlm/rnnlm-example-test.cc @@ -305,6 +305,8 @@ int main() { SetVerboseLevel(4); CuDevice::Instantiate().PrintProfile(); #endif + + unlink("tmp.ark"); return 0; } diff --git a/src/rnnlm/rnnlm-training.cc b/src/rnnlm/rnnlm-training.cc index 370f6395dc0..6db4d6f05b4 100644 --- a/src/rnnlm/rnnlm-training.cc +++ b/src/rnnlm/rnnlm-training.cc @@ -110,6 +110,9 @@ void RnnlmTrainer::Train(RnnlmExample *minibatch) { active_word_features_trans_.Swap(&active_word_features_trans); TrainInternal(); + + if (num_minibatches_processed_ == 1) + core_trainer_->ConsolidateMemory(); } diff --git a/src/rnnlmbin/rnnlm-train.cc b/src/rnnlmbin/rnnlm-train.cc index 6a212dd4aad..d9107e310f5 100644 --- a/src/rnnlmbin/rnnlm-train.cc +++ b/src/rnnlmbin/rnnlm-train.cc @@ -22,7 +22,7 @@ #include "rnnlm/rnnlm-training.h" #include "rnnlm/rnnlm-example-utils.h" #include "nnet3/nnet-utils.h" - +#include "cudamatrix/cu-allocator.h" int main(int argc, char *argv[]) { try { @@ -93,6 +93,7 @@ int main(int argc, char *argv[]) { objective_config.Register(&po); + RegisterCuAllocatorOptions(&po); // register the core RNNLM training options options with the prefix "rnnlm", // so they will appear as --rnnlm.max-change and the like. This is done diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc index 4eab67f52be..5583717633c 100644 --- a/src/tree/context-dep.cc +++ b/src/tree/context-dep.cc @@ -319,8 +319,8 @@ void ContextDependency::GetPdfInfo( ContextDependency* -MonophoneContextDependency(const std::vector phones, - const std::vector phone2num_pdf_classes) { +MonophoneContextDependency(const std::vector &phones, + const std::vector &phone2num_pdf_classes) { std::vector > phone_sets(phones.size()); for (size_t i = 0; i < phones.size(); i++) phone_sets[i].push_back(phones[i]); std::vector share_roots(phones.size(), false); // don't share roots. @@ -331,8 +331,8 @@ MonophoneContextDependency(const std::vector phones, } ContextDependency* -MonophoneContextDependencyShared(const std::vector > phone_sets, - const std::vector phone2num_pdf_classes) { +MonophoneContextDependencyShared(const std::vector > &phone_sets, + const std::vector &phone2num_pdf_classes) { std::vector share_roots(phone_sets.size(), false); // don't share roots. // N is context size, P = position of central phone (must be 0). 
diff --git a/src/rnnlm/rnnlm-example-test.cc b/src/rnnlm/rnnlm-example-test.cc
index 8b393acf4ff..ccfdd90bbea 100644
--- a/src/rnnlm/rnnlm-example-test.cc
+++ b/src/rnnlm/rnnlm-example-test.cc
@@ -305,6 +305,8 @@ int main() {
   SetVerboseLevel(4);
   CuDevice::Instantiate().PrintProfile();
 #endif
+
+  unlink("tmp.ark");
   return 0;
 }
diff --git a/src/rnnlm/rnnlm-training.cc b/src/rnnlm/rnnlm-training.cc
index 370f6395dc0..6db4d6f05b4 100644
--- a/src/rnnlm/rnnlm-training.cc
+++ b/src/rnnlm/rnnlm-training.cc
@@ -110,6 +110,9 @@ void RnnlmTrainer::Train(RnnlmExample *minibatch) {
   active_word_features_trans_.Swap(&active_word_features_trans);

   TrainInternal();
+
+  if (num_minibatches_processed_ == 1)
+    core_trainer_->ConsolidateMemory();
 }

diff --git a/src/rnnlmbin/rnnlm-train.cc b/src/rnnlmbin/rnnlm-train.cc
index 6a212dd4aad..d9107e310f5 100644
--- a/src/rnnlmbin/rnnlm-train.cc
+++ b/src/rnnlmbin/rnnlm-train.cc
@@ -22,7 +22,7 @@
 #include "rnnlm/rnnlm-training.h"
 #include "rnnlm/rnnlm-example-utils.h"
 #include "nnet3/nnet-utils.h"
-
+#include "cudamatrix/cu-allocator.h"

 int main(int argc, char *argv[]) {
   try {
@@ -93,6 +93,7 @@ int main(int argc, char *argv[]) {

     objective_config.Register(&po);
+    RegisterCuAllocatorOptions(&po);

     // register the core RNNLM training options options with the prefix "rnnlm",
     // so they will appear as --rnnlm.max-change and the like.  This is done
diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc
index 4eab67f52be..5583717633c 100644
--- a/src/tree/context-dep.cc
+++ b/src/tree/context-dep.cc
@@ -319,8 +319,8 @@ void ContextDependency::GetPdfInfo(


 ContextDependency*
-MonophoneContextDependency(const std::vector<int32> phones,
-                           const std::vector<int32> phone2num_pdf_classes) {
+MonophoneContextDependency(const std::vector<int32> &phones,
+                           const std::vector<int32> &phone2num_pdf_classes) {
   std::vector<std::vector<int32> > phone_sets(phones.size());
   for (size_t i = 0; i < phones.size(); i++) phone_sets[i].push_back(phones[i]);
   std::vector<bool> share_roots(phones.size(), false);  // don't share roots.
@@ -331,8 +331,8 @@ MonophoneContextDependency(const std::vector<int32> phones,
 }

 ContextDependency*
-MonophoneContextDependencyShared(const std::vector<std::vector<int32> > phone_sets,
-                                 const std::vector<int32> phone2num_pdf_classes) {
+MonophoneContextDependencyShared(const std::vector<std::vector<int32> > &phone_sets,
+                                 const std::vector<int32> &phone2num_pdf_classes) {
   std::vector<bool> share_roots(phone_sets.size(), false);  // don't share roots.
   // N is context size, P = position of central phone (must be 0).
   int32 num_leaves = 0, P = 0, N = 1;
diff --git a/src/tree/context-dep.h b/src/tree/context-dep.h
index 6342d89667b..e69c26f8638 100644
--- a/src/tree/context-dep.h
+++ b/src/tree/context-dep.h
@@ -180,15 +180,15 @@ ContextDependency *GenRandContextDependencyLarge(const std::vector<int32> &phone
 // 0, 1, 2).

 ContextDependency*
-MonophoneContextDependency(const std::vector<int32> phones,
-                           const std::vector<int32> phone2num_pdf_classes);
+MonophoneContextDependency(const std::vector<int32> &phones,
+                           const std::vector<int32> &phone2num_pdf_classes);

 // MonophoneContextDependencyShared is as MonophoneContextDependency but lets
 // you define classes of phones which share pdfs (e.g. different stress-markers of a single
 // phone.)  Each element of phone_classes is a set of phones that are in that class.
 ContextDependency*
-MonophoneContextDependencyShared(const std::vector<std::vector<int32> > phone_classes,
-                                 const std::vector<int32> phone2num_pdf_classes);
+MonophoneContextDependencyShared(const std::vector<std::vector<int32> > &phone_classes,
+                                 const std::vector<int32> &phone2num_pdf_classes);


 // Important note:
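For reference, a hypothetical usage sketch of the monophone-tree builder whose signature changes above; the helper name and the phone counts are made up for illustration, and the point is simply that the two vectors are now taken by const reference, so calls like this no longer copy them.

    #include <vector>
    #include "base/kaldi-common.h"
    #include "tree/context-dep.h"

    // Illustrative only: build a monophone tree for phones 1..10, each with
    // 3 pdf-classes (HMM states).  The caller owns the returned pointer.
    kaldi::ContextDependency *BuildMonophoneTreeSketch() {
      using namespace kaldi;
      std::vector<int32> phones;
      for (int32 p = 1; p <= 10; p++)
        phones.push_back(p);
      // Indexed by phone-id; entry 0 is unused.
      std::vector<int32> phone2num_pdf_classes(11, 3);
      // With the const-reference signatures, no copies of the vectors are made.
      return MonophoneContextDependency(phones, phone2num_pdf_classes);
    }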
diff --git a/src/util/kaldi-thread-test.cc b/src/util/kaldi-thread-test.cc
index e1776859222..eb6b72d1ed4 100644
--- a/src/util/kaldi-thread-test.cc
+++ b/src/util/kaldi-thread-test.cc
@@ -128,6 +128,6 @@ void TestTaskSequencer() {
 int main() {
   using namespace kaldi;
   TestThreads();
-  for (int32 i = 0; i < 1000; i++)
+  for (int32 i = 0; i < 10; i++)
     TestTaskSequencer();
 }
diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh
index cd9ec7f5c1e..dc3ad8fbe57 100755
--- a/tools/extras/check_dependencies.sh
+++ b/tools/extras/check_dependencies.sh
@@ -24,15 +24,15 @@ COMPILER_VER_INFO=$($CXX --version 2>/dev/null)
 case $COMPILER_VER_INFO in
   "")
     echo "$0: $CXX is not installed."
-    echo "$0: You need g++ >= 4.7, Apple Xcode >= 5.0 or clang >= 3.3."
+    echo "$0: You need g++ >= 4.8.3, Apple Xcode >= 5.0 or clang >= 3.3."
     status=1
     ;;
   "g++ "* )
     GCC_VER=$($CXX -dumpversion)
     GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d")
-    if [ $GCC_VER_NUM -lt 40700 ]; then
+    if [ $GCC_VER_NUM -lt 40803 ]; then
       echo "$0: $CXX (g++-$GCC_VER) is not supported."
-      echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3."
+      echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
       status=1
     fi
     ;;
@@ -42,7 +42,7 @@ case $COMPILER_VER_INFO in
     CLANG_VER_NUM=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*clang-\([0-9]*\).*/\1/")
     if [ $CLANG_VER_NUM -lt 500 ]; then
       echo "$0: $CXX (Apple clang-$CLANG_VER) is not supported."
-      echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3."
+      echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
       status=1
     fi
     ;;
@@ -51,7 +51,7 @@ case $COMPILER_VER_INFO in
     CLANG_VER_NUM=$(echo $CLANG_VER | sed 's/\./ /g' | xargs printf "%d%02d")
     if [ $CLANG_VER_NUM -lt 303 ]; then
       echo "$0: $CXX (LLVM clang-$CLANG_VER) is not supported."
-      echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3."
+      echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
       status=1
     fi
     ;;
diff --git a/windows/INSTALL.md b/windows/INSTALL.md
index cd9c77b1776..c48f2908e98 100644
--- a/windows/INSTALL.md
+++ b/windows/INSTALL.md
@@ -6,10 +6,10 @@ For cygwin installation, see the instructions in `../INSTALL`.
 ## Notes

  * The recipes (in egs/) will not work. There is no commitment to support Windows.
-   The Windows port of Kaldi is targeted at experienced developers who want
-   to program their own apps using the kaldi libraries and are able to do
-   the troubleshooting on their own.
-* These instructions are valid November 2017,
+   The Windows port of Kaldi is targeted at experienced developers who want
+   to program their own apps using the kaldi libraries and are able to do
+   the troubleshooting on their own.
+* These instructions are valid November 2017,
   [Intel® MKL](https://software.intel.com/en-us/intel-mkl) and OpenBLAS are supported
 * ATLAS is not supported and I personally have no intention to work on
   supporting it, as it requires whole cygwin environment
@@ -19,7 +19,7 @@ For cygwin installation, see the instructions in `../INSTALL`.
   and we didn't test if the solutions work or not.
 * While the 32bit project files will still be generated, we don't really care
   if they work or not. They will be removed in the near future.
-* The build process was validated using MSVC2017. We do not support earlier
+* The build process was validated using MSVC2017. We do not support earlier
   releases (i.e. MSVC2015 and older). The reason is the C++11 support is still
   very buggy in the MS compiler.
 * We support only openfst-1.6.5 for now.
@@ -36,7 +36,7 @@ For cygwin installation, see the instructions in `../INSTALL`.
 ## Compiling OpenFST
 Skip this section, if you have downloaded OpenFST project from https://github.com/kkm000/openfst.git
 and it already contains openfst.sln file in the root folder. If it is present you can directly open it with Visual Studio 17 and you do not need CMake.
--------------------------
+-------------------------
 For compilation of OpenFST, you will need CMake installed. Simply go to https://cmake.org/download/
 and download and install.
 Then, in the command line, run the following commands. Be very careful about writing the commands verbatim!
@@ -45,7 +45,7 @@ Then, in the command line, run the following commands. Be very careful about wri
     $ mkdir build64
     $ cd build64
     $ cmake -G "Visual Studio 15 2017 Win64" ../
-
+
 The last command will generate output looking similarly to this. Do not try to read too much into specific versions of the programs.

 -- The C compiler identification is MSVC 19.11.25547.0
@@ -73,20 +73,20 @@ The last command will generate output looking similarly to this. Do not try to r
 -- Generating done
 -- Build files have been written to: C:/Users/jtrmal/Documents/openfst/build64

-In the directory `build64`, find the file `openfst.sln` and open it using Visual Studio 17.
--------------------------
+In the directory `build64`, find the file `openfst.sln` and open it using Visual Studio 17.
+-------------------------

- **Switch the configuration to `debug|Win64` and build the solution.**
- **Do the same for configuration `release|Win64`.**
+ **Switch the configuration to `Debug|x64` and build the solution.**
+ **Do the same for configuration `Release|x64`.**

 If either of the two won't build, you should stop here and start figuring what's different!

-## Compiling Kaldi
-
+## Compiling Kaldi
+
 1. Checkout Kaldi trunk, using [git](https://git-for-windows.github.io/) from
    https://github.com/kaldi-asr/kaldi.git
    Example:
-
+
       $ git clone https://github.com/kaldi-asr/kaldi.git kaldi

 There are two options to use for BLAS (linear algebra): [Intel® MKL](https://software.intel.com/en-us/intel-mkl) and OpenBLAS.
 [Intel® MKL](https://software.intel.com/en-us/intel-mkl) is made by Intel and is optimised
@@ -124,7 +124,7 @@ for their processors. It isn't free, but you can get [Community Licensing for In

 4. Enter the `(kaldi)/windows` directory
    Example:
-
+
       (kaldi)/$ cd windows

       (kaldi)/windows $ pwd
@@ -148,7 +148,7 @@ for their processors. It isn't free, but you can get [Community Licensing for In
        generate_solution.pl --vsver [--enable-cuda] [--enable-openblas] [--enable-mkl]

     `--enable-mkl` is the default so you shouldn't need to use it. If `--enable-openblas` is passed it disables MKL support.
-    CUDA is disabled by default. The default Visual Studio version is 15.0 (Visual Studio 2017).
+    CUDA is disabled by default. The default Visual Studio version is 15.0 (Visual Studio 2017).
     Please note that while we support generating the project for Visual Studio 2015,
     the C++11 support for that compiler is rather sub-par, i.e. it won't probably compile.
     When choosing Visual Studio 2015, you are on your own!
@@ -161,10 +161,10 @@ for their processors. It isn't free, but you can get [Community Licensing for In

       (kaldi)/windows$ generate_solution.pl --vsver vs2017 --enable-cuda --enable-openblas

 9. Run the script (kaldi)/windows/get_version.pl:
-
+
       (kaldi)/windows$ get_version.pl
-
-
-10. Open the generated solution that was created in a subfolder (kaldi)/kaldiwin_vs_
+
+
+10. Open the generated solution that was created in a subfolder (kaldi)/kaldiwin_vs_
     in the visual studio and switch to **Debug|x64** (or **Release|x64**) and build.
     Expect 10 projects to fail, majority of them will fail because of missing include `portaudio.h`.
     The tests will fail to compile too -- this is because of deficiency of the script generate_solution.pl. We might fix it
diff --git a/windows/variables.props.dev b/windows/variables.props.dev
index d797f2f2abf..9fb2457c99c 100644
--- a/windows/variables.props.dev
+++ b/windows/variables.props.dev
@@ -7,7 +7,7 @@
     C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl\
     C:\Users\Yenda\Downloads\kaldi-svn\tools\OpenBLAS-v0.2.14-Win64-int32
     C:\Users\jtrmal\Documents\openfst\
-    C:\Users\jtrmal\Documents\openfst\build64\lib
+    C:\Users\jtrmal\Documents\openfst\build64