diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh index 731b6721a78..de0a925a242 100755 --- a/egs/aspire/s5/local/run_asr_segmentation.sh +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -48,7 +48,6 @@ reco_nj=40 # test options test_nj=30 -test_stage=1 . ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi @@ -85,12 +84,10 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ exit 1 fi -data_id=$(basename $data_dir) whole_data_dir=${data_dir}_whole -targets_dir=exp/segmentation${affix}/${data_id}_whole_combined_targets_sub3 +whole_data_id=$(basename $whole_data_dir) rvb_data_dir=${whole_data_dir}_rvb_hires -rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -101,26 +98,15 @@ fi ############################################################################### if [ $stage -le 1 ]; then steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ - $whole_data_dir exp/make_mfcc/${data_id}_whole - steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${data_id}_whole + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! -f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - - # Use recording as the "speaker". This is required by prepare_targets_gmm.sh script. - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ @@ -132,6 +118,7 @@ if [ $stage -le 3 ]; then $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi +rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 4 ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises if [ ! 
-f rirs_noises.zip ]; then @@ -164,30 +151,29 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ]; then - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $reco_nj \ ${rvb_data_dir} steps/compute_cmvn_stats.sh ${rvb_data_dir} utils/fix_data_dir.sh $rvb_data_dir fi if [ $stage -le 6 ]; then - rvb_targets_dirs=() - for i in `seq 1 $num_data_reps`; do - steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ - $targets_dir ${targets_dir}_temp_$i || exit 1 - rvb_targets_dirs+=(${targets_dir}_temp_$i) - done - - steps/segmentation/combine_targets_dirs.sh \ - $rvb_data_dir ${rvb_targets_dir} \ - ${rvb_targets_dirs[@]} || exit 1; - - rm -r ${rvb_targets_dirs[@]} + rvb_targets_dirs=() + for i in `seq 1 $num_data_reps`; do + steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ + $targets_dir ${targets_dir}_temp_$i || exit 1 + rvb_targets_dirs+=(${targets_dir}_temp_$i) + done + + steps/segmentation/combine_targets_dirs.sh \ + $rvb_data_dir ${rvb_targets_dir} \ + ${rvb_targets_dirs[@]} || exit 1; + + rm -r ${rvb_targets_dirs[@]} fi -sad_nnet_dir=exp/segmentation${affix}/tdnn_stats_asr_sad_1a -#sad_nnet_dir=exp/segmentation${affix}/tdnn_lstm_asr_sad_1a -#sad_opts="--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" + +sad_nnet_dir=$dir/tdnn_stats_asr_sad_1a if [ $stage -le 7 ]; then # Train a STATS-pooling network for SAD @@ -216,6 +202,13 @@ fi chain_dir=exp/chain/tdnn_lstm_1a +# The context options in "sad_opts" must match the options used to train the +# SAD network in "sad_nnet_dir" +sad_opts="--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" + +# For LSTM SAD network, the options might be something like +# sad_opts="--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 --extra-left-context-initial 0 --extra-right-context-final 0 --acwt 0.3" + if [ $stage -le 9 ]; then # Use left and right context options that were used when training # the chain nnet diff --git a/egs/babel/s5d/local/run_asr_segmentation.sh b/egs/babel/s5d/local/run_asr_segmentation.sh index 7bfc3fd60ca..f70775526b6 100755 --- a/egs/babel/s5d/local/run_asr_segmentation.sh +++ b/egs/babel/s5d/local/run_asr_segmentation.sh @@ -35,11 +35,15 @@ merge_weights=1.0,0.1,0.5 prepare_targets_stage=-10 nstage=-10 train_stage=-10 -test_stage=-10 affix=_1a stage=-1 nj=80 +reco_nj=40 + +# test options +test_nj=32 +test_stage=-10 # Babel specific configuration. These two lines can be removed when adapting to other corpora. [ ! -f ./lang.conf ] && echo 'Language configuration does not exist! Use the configurations in conf/lang/* as a startup' && exit 1 @@ -63,14 +67,14 @@ garbage_phones=" " silence_phones=" SIL" for p in $garbage_phones; do - for affix in "" "_B" "_E" "_I" "_S"; do - echo "$p$affix" + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" done done > $dir/garbage_phones.txt for p in $silence_phones; do - for affix in "" "_B" "_E" "_I" "_S"; do - echo "$p$affix" + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" done done > $dir/silence_phones.txt @@ -81,6 +85,7 @@ if ! 
cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ fi whole_data_dir=${data_dir}_whole +whole_data_id=$(basename $whole_data_dir) if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -91,39 +96,34 @@ fi ############################################################################### if [ $stage -le 1 ]; then if $use_pitch; then - steps/make_plp_pitch.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \ + steps/make_plp_pitch.sh --cmd "$train_cmd" --nj $reco_nj --write-utt2num-frames true \ ${whole_data_dir} || exit 1 else - steps/make_plp.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \ + steps/make_plp.sh --cmd "$train_cmd" --nj $reco_nj --write-utt2num-frames true \ ${whole_data_dir} || exit 1 fi + steps/compute_cmvn_stats.sh $whole_data_dir + utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! -f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ - --nj 80 --reco-nj 40 --lang-test $lang_test \ + --nj $nj --reco-nj $reco_nj --lang-test $lang_test \ --garbage-phones-list $dir/garbage_phones.txt \ --silence-phones-list $dir/silence_phones.txt \ + --merge-weights "$merge_weights" \ + --graph-dir "$graph_dir" \ $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi if [ $stage -le 4 ]; then utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires_bp - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_bp.conf --nj 40 \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_bp.conf --nj $reco_nj \ ${whole_data_dir}_hires_bp steps/compute_cmvn_stats.sh ${whole_data_dir}_hires_bp fi @@ -132,7 +132,7 @@ if [ $stage -le 5 ]; then # Train a TDNN-LSTM network for SAD local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ - --targets-dir $dir \ + --targets-dir $targets_dir \ --data-dir ${whole_data_dir}_hires_bp fi @@ -147,7 +147,7 @@ if [ $stage -le 6 ]; then steps/segmentation/detect_speech_activity.sh \ --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --nj 32 --acwt 0.3 --stage $test_stage \ + --nj $test_nj --acwt 0.3 --stage $test_stage \ data/dev10h.pem \ exp/segmentation_1a/tdnn_lstm_asr_sad_1a \ mfcc_hires_bp \ diff --git a/egs/chime5/s5/local/train_lms_srilm.sh b/egs/chime5/s5/local/train_lms_srilm.sh index 8caa251fa35..5a1d56d24b3 100755 --- a/egs/chime5/s5/local/train_lms_srilm.sh +++ b/egs/chime5/s5/local/train_lms_srilm.sh @@ -99,7 +99,7 @@ fi # Kaldi transcript files contain Utterance_ID as the first word; remove it # We also have to avoid skewing the LM by incorporating the same sentences # from different channels -sed -e "s/\.CH.//" -e "s/_.\-./_/" $train_text | sort -u | \ +sed -e "s/\.CH.//" -e 
"s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \ perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/train.txt if (($?)); then echo "Failed to create $tgtdir/train.txt from $train_text" diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index 4a39dfb66ac..2f050be93f2 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7n.sh \ No newline at end of file +tuning/run_tdnn_7o.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh new file mode 100755 index 00000000000..753dfc632ba --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -0,0 +1,297 @@ +#!/bin/bash + + +# 7o is as 7n but with a bunch of tuning changes affecting both the structure +# and the learning rates/l2 regularization. Structurally the main change is +# that we also do splicing via an extra layer whose input and output are in the +# "small" dim (256); this increases the left and right context. We also change +# the orthonormal-constraint to be "floating" meaning it doesn't constrain the +# size of the matrix (the value orthonormal-constraint=-1 is interpreted +# specially by the code), which means we can control how fast these constrained +# layers learn layers via l2, just like the unconstrained layers. Also the l2 +# values were increased and the learning rates were decreased; there are +# more epochs (6->8); and the dimension of some of the layers (the ones that +# are subsampled and which don't receive skip-splicing) was increased from +# 1280 to 1536. The config is a bit messy and I'd like to find a way to +# encapsulate things a bit better; treat this as a work in progress. +# +# +# +# local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp +# System tdnn7n_sp tdnn7m26j_sp +# WER on train_dev(tg) 12.18 11.74 +# WER on train_dev(fg) 11.12 10.69 +# WER on eval2000(tg) 14.9 14.6 +# WER on eval2000(fg) 13.5 13.1 +# WER on rt03(tg) 18.4 17.5 +# WER on rt03(fg) 16.2 15.4 +# Final train prob -0.077 -0.070 +# Final valid prob -0.093 -0.084 +# Final train prob (xent) -0.994 -0.883 +# Final valid prob (xent) -1.0194 -0.9110 +# Num-parameters 20111396 22865188 + + +# exp/chain/tdnn7o_sp: num-iters=525 nj=3..16 num-params=22.9M dim=40+100->6034 combine=-0.074->-0.073 (over 7) xent:train/valid[348,524,final]=(-1.20,-0.884,-0.883/-1.24,-0.918,-0.911) logprob:train/valid[348,524,final]=(-0.100,-0.071,-0.070/-0.115,-0.086,-0.084) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7o +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1280 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1280 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1280 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1536 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + 
output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1536 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + +# --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + + + steps/nnet3/chain/train.py --stage $train_stage \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 8 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
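+  # prepare_online_decoding.sh copies the final model and bundles the MFCC
+  # config and the i-vector extractor setup into ${dir}_online, so the
+  # decoding below can compute features on the fly from wav.scp.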
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index af7d3a428ba..6da8b55631d 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -38,11 +38,15 @@ merge_weights=1.0,0.1,0.5 prepare_targets_stage=-10 nstage=-10 train_stage=-10 -test_stage=-10 num_data_reps=2 affix=_1a # For segmentation stage=-1 nj=80 +reco_nj=40 + +# test options +test_stage=-10 +test_nj=32 . ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi @@ -79,12 +83,10 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ exit 1 fi -data_id=$(basename $data_dir) whole_data_dir=${data_dir}_whole -targets_dir=exp/segmentation${affix}/${data_id}_whole_combined_targets_sub3 +whole_data_id=$(basename $whole_data_dir) rvb_data_dir=${whole_data_dir}_rvb_hires -rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -94,29 +96,20 @@ fi # Extract features for the whole data directory ############################################################################### if [ $stage -le 1 ]; then - steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" --write-utt2num-frames true \ - $whole_data_dir exp/make_mfcc/${data_id}_whole - steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${data_id}_whole + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### -# Get feats for the manual segments +# Prepare SAD targets for recordings ############################################################################### -if [ $stage -le 2 ]; then - if [ ! 
-f ${data_dir}/segments ]; then - utils/data/get_segments_for_data.sh $data_dir > $data_dir/segments - fi - utils/data/subsegment_data_dir.sh $whole_data_dir ${data_dir}/segments ${data_dir}/tmp - cp $data_dir/tmp/feats.scp $data_dir - awk '{print $1" "$2}' $data_dir/segments > $data_dir/utt2spk - utils/utt2spk_to_spk2utt.pl $data_dir/utt2spk > $data_dir/spk2utt -fi - +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 if [ $stage -le 3 ]; then steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ - --nj 80 --reco-nj 40 --lang-test $lang_test \ + --nj $nj --reco-nj $reco_nj --lang-test $lang_test \ --garbage-phones-list $dir/garbage_phones.txt \ --silence-phones-list $dir/silence_phones.txt \ --merge-weights "$merge_weights" \ @@ -124,6 +117,7 @@ if [ $stage -le 3 ]; then $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi +rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 4 ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises if [ ! -f rirs_noises.zip ]; then @@ -156,7 +150,7 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ]; then - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $reco_nj \ ${rvb_data_dir} steps/compute_cmvn_stats.sh ${rvb_data_dir} utils/fix_data_dir.sh $rvb_data_dir @@ -196,7 +190,7 @@ if [ $stage -le 8 ]; then steps/segmentation/detect_speech_activity.sh \ --extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --nj 32 --acwt 0.3 --stage $test_stage \ + --nj $test_nj --acwt 0.3 --stage $test_stage \ data/eval2000 \ exp/segmentation${affix}/tdnn_stats_asr_sad_1a \ mfcc_hires \ diff --git a/egs/vystadial_cz/s5b/RESULTS b/egs/vystadial_cz/s5b/RESULTS new file mode 100644 index 00000000000..ec945059fc5 --- /dev/null +++ b/egs/vystadial_cz/s5b/RESULTS @@ -0,0 +1,17 @@ + +# monophone system (shortest 10k) +%WER 75.81 [ 8989 / 11858, 421 ins, 2691 del, 5877 sub ] exp/mono/decode_dev/wer_10_0.0 +# delta + delta-delta triphone system +%WER 55.97 [ 6637 / 11858, 494 ins, 1664 del, 4479 sub ] exp/tri1/decode_dev/wer_14_0.5 +# LDA+MLLT system +%WER 50.98 [ 6045 / 11858, 439 ins, 1564 del, 4042 sub ] exp/tri2b/decode_dev/wer_12_0.5 +# LDA+MLLT+SAT system +%WER 51.76 [ 6138 / 11858, 627 ins, 1276 del, 4235 sub ] exp/tri3b/decode_dev/wer_12_0.0 +# LDA+MLLT+SAT system with silence probabilities +%WER 51.75 [ 6137 / 11858, 622 ins, 1282 del, 4233 sub ] exp/tri3b/decode_sp_dev/wer_12_0.5 + +# chain tdnn system +%WER 33.29 [ 3948 / 11858, 480 ins, 787 del, 2681 sub ] exp/chain/tdnn1a_sp/decode_dev/wer_9_1.0 +%WER 32.15 [ 3735 / 11617, 394 ins, 803 del, 2538 sub ] exp/chain/tdnn1a_sp/decode_test/wer_11_0.5 +%WER 33.20 [ 3937 / 11858, 514 ins, 734 del, 2689 sub ] exp/chain/tdnn1a_sp_online/decode_dev/wer_10_0.5 +%WER 32.04 [ 3722 / 11617, 451 ins, 723 del, 2548 sub ] exp/chain/tdnn1a_sp_online/decode_test/wer_10_0.5 diff --git a/egs/vystadial_cz/s5b/cmd.sh b/egs/vystadial_cz/s5b/cmd.sh new file mode 100644 index 00000000000..71dd849a93b --- /dev/null +++ b/egs/vystadial_cz/s5b/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/vystadial_cz/s5b/conf/decode.config b/egs/vystadial_cz/s5b/conf/decode.config new file mode 100644 index 00000000000..7ba966f2b83 --- /dev/null +++ b/egs/vystadial_cz/s5b/conf/decode.config @@ -0,0 +1 @@ +# empty config, just use the defaults. diff --git a/egs/vystadial_cz/s5b/conf/mfcc.conf b/egs/vystadial_cz/s5b/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/vystadial_cz/s5b/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/vystadial_cz/s5b/conf/mfcc_hires.conf b/egs/vystadial_cz/s5b/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/vystadial_cz/s5b/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/vystadial_cz/s5b/conf/online_cmvn.conf b/egs/vystadial_cz/s5b/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/vystadial_cz/s5b/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/vystadial_cz/s5b/env_voip_cs.sh b/egs/vystadial_cz/s5b/env_voip_cs.sh new file mode 120000 index 00000000000..7adc3c6960f --- /dev/null +++ b/egs/vystadial_cz/s5b/env_voip_cs.sh @@ -0,0 +1 @@ +../s5/env_voip_cs.sh \ No newline at end of file diff --git a/egs/vystadial_cz/s5b/local/chain/compare_wer.sh b/egs/vystadial_cz/s5b/local/chain/compare_wer.sh new file mode 100755 index 00000000000..14ca1196e64 --- /dev/null +++ b/egs/vystadial_cz/s5b/local/chain/compare_wer.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copied from egs/mini_librispeech/s5/local/chain/compare_wer.sh (commit 421a062477d732fc02e2109b9d50857ae0f18661) + +# this script is used for comparing decoding results between systems. +# e.g. 
local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev" + "#WER test") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev test) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
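+  # (for non-discriminative systems the numbers printed below are read from
+  # the nnet3 diagnostic logs compute_prob_{train,valid}.final.log and
+  # progress.1.log)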
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/vystadial_cz/s5b/local/chain/run_tdnn.sh b/egs/vystadial_cz/s5b/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/vystadial_cz/s5b/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..496ee5e84ca --- /dev/null +++ b/egs/vystadial_cz/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# Adapted from egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh + +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1a_sp_online +# System tdnn1a_sp tdnn1a_sp_online +#WER dev 33.29 33.20 +#WER test 32.15 32.04 +# Final train prob -0.0988 +# Final valid prob -0.1913 +# Final train prob (xent) -1.6242 +# Final valid prob (xent) -1.9833 +# Num-params 6117328 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp: num-iters=38 nj=2..12 num-params=6.1M dim=40+100->2024 combine=-0.116->-0.115 (over 2) xent:train/valid[24,37,final]=(-1.89,-1.65,-1.62/-2.17,-2.01,-1.98) logprob:train/valid[24,37,final]=(-0.134,-0.105,-0.099/-0.206,-0.196,-0.191) + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets="dev test" +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
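+  # (the 'chain' numerator supervision is built from these lattices, so
+  # alternative pronunciations and alignment paths are kept rather than a
+  # single forced alignment)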
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_sp $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05 dropout-per-dim-continuous=true" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
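+  # (with xent_regularize=0.1, as set at the top of this script, the factor
+  # comes out to 0.5 / 0.1 = 5.0)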
+ relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=15 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_sp_test \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l &2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + +# SRILM is needed for LM model building +SRILM_ROOT=$KALDI_ROOT/tools/srilm +SRILM_PATH=$SRILM_ROOT/bin:$SRILM_ROOT/bin/i686-m64 +export PATH=$PATH:$SRILM_PATH + +export LC_ALL=C + diff --git a/egs/vystadial_cz/s5b/run.sh b/egs/vystadial_cz/s5b/run.sh new file mode 100755 index 00000000000..f837b273466 --- /dev/null +++ b/egs/vystadial_cz/s5b/run.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# Change this location to somewhere where you want to put the data. +data=$HOME/vystadial_cz + +# Load training parameters +. ./env_voip_cs.sh + +. ./cmd.sh +. ./path.sh + +stage=0 +. 
utils/parse_options.sh + +set -euo pipefail + +mkdir -p $data + +if [ $stage -le 0 ]; then + local/download_cs_data.sh $data || exit 1; +fi + +lm="build3" + +if [ $stage -le 1 ]; then + local/data_split.sh --every_n 1 $data data "$lm" "dev test" + + local/create_LMs.sh data/local data/train/trans.txt \ + data/test/trans.txt data/local/lm "$lm" + + gzip data/local/lm/$lm + + local/prepare_cs_transcription.sh data/local data/local/dict + + local/create_phone_lists.sh data/local/dict + + utils/prepare_lang.sh data/local/dict '_SIL_' data/local/lang data/lang + + utils/format_lm.sh data/lang data/local/lm/$lm.gz data/local/dict/lexicon.txt data/lang_test + + for part in dev test train; do + mv data/$part/trans.txt data/$part/text + done +fi + +if [ $stage -le 2 ]; then + mfccdir=mfcc + + for part in dev train; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/$part exp/make_mfcc/$part $mfccdir + steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir + done + + # Get the shortest 10000 utterances first because those are more likely + # to have accurate alignments. + utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort +fi + +# train a monophone system +if [ $stage -le 3 ]; then + steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_10kshort data/lang exp/mono + ( + utils/mkgraph.sh data/lang_test \ + exp/mono exp/mono/graph + for test in dev; do + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono/graph \ + data/$test exp/mono/decode_$test + done + )& + + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali_train +fi + +# train a first delta + delta-delta triphone system on all utterances +if [ $stage -le 4 ]; then + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/train data/lang exp/mono_ali_train exp/tri1 + + # decode using the tri1 model + ( + utils/mkgraph.sh data/lang_test \ + exp/tri1 exp/tri1/graph + for test in dev; do + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri1/graph \ + data/$test exp/tri1/decode_$test + done + )& + + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali_train +fi + +# train an LDA+MLLT system. +if [ $stage -le 5 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ + data/train data/lang exp/tri1_ali_train exp/tri2b + + # decode using the LDA+MLLT model + ( + utils/mkgraph.sh data/lang_test \ + exp/tri2b exp/tri2b/graph + for test in dev; do + steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/tri2b/graph \ + data/$test exp/tri2b/decode_$test + done + )& + + # Align utts using the tri2b model + steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ + data/train data/lang exp/tri2b exp/tri2b_ali_train +fi + +# Train tri3b, which is LDA+MLLT+SAT +if [ $stage -le 6 ]; then + steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ + data/train data/lang exp/tri2b_ali_train exp/tri3b + + # decode using the tri3b model + ( + utils/mkgraph.sh data/lang_test \ + exp/tri3b exp/tri3b/graph + for test in dev; do + steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri3b/graph data/$test \ + exp/tri3b/decode_$test + done + )& +fi + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory. 
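+# get_prons.sh collects pronunciation and silence counts from the tri3b
+# system, dict_dir_add_pronprobs.sh folds them into data/local/dict_sp, and
+# prepare_lang.sh / format_lm.sh then rebuild data/lang_sp and
+# data/lang_sp_test, which the later stages use for alignment and decoding.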
+if [ $stage -le 7 ]; then + steps/get_prons.sh --cmd "$train_cmd" \ + data/train data/lang exp/tri3b + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict \ + exp/tri3b/pron_counts_nowb.txt exp/tri3b/sil_counts_nowb.txt \ + exp/tri3b/pron_bigram_counts_nowb.txt data/local/dict_sp + + utils/prepare_lang.sh data/local/dict_sp "_SIL_" data/local/lang_tmp data/lang_sp + + utils/format_lm.sh data/lang_sp data/local/lm/$lm.gz data/local/dict_sp/lexicon.txt data/lang_sp_test + + steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \ + data/train data/lang_sp exp/tri3b exp/tri3b_ali_train_sp +fi + +if [ $stage -le 8 ]; then + # Test the tri3b system with the silprobs and pron-probs. + + # decode using the tri3b model + utils/mkgraph.sh data/lang_sp_test \ + exp/tri3b exp/tri3b/graph_sp + + for test in dev; do + steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri3b/graph_sp data/$test \ + exp/tri3b/decode_sp_$test + done +fi + +# Train a chain model +if [ $stage -le 9 ]; then + local/chain/run_tdnn.sh --stage 0 +fi + +# Don't finish until all background decoding jobs are finished. +wait diff --git a/egs/vystadial_cz/s5b/steps b/egs/vystadial_cz/s5b/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/vystadial_cz/s5b/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/vystadial_cz/s5b/utils b/egs/vystadial_cz/s5b/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/vystadial_cz/s5b/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 4c6a37fb837..b20c64ab9ba 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -941,9 +941,10 @@ def __init__(self, action=common_lib.NullstrToNoneAction, help="Script to launch egs jobs") self.parser.add_argument("--use-gpu", type=str, - action=common_lib.StrToBoolAction, - choices=["true", "false"], - help="Use GPU for training", default=True) + choices=["true", "false", "yes", "no", "wait"], + help="Use GPU for training. " + "Note 'true' and 'false' are deprecated.", + default="yes") self.parser.add_argument("--cleanup", type=str, action=common_lib.StrToBoolAction, choices=["true", "false"], default=True, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 6fbde1fbbcc..99911b39fb2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -69,6 +69,7 @@ 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, 'renorm-component': xlayers.XconfigRenormComponent, + 'batchnorm-component': xlayers.XconfigBatchnormComponent, 'no-op-component': xlayers.XconfigNoOpComponent, 'linear-component': xlayers.XconfigLinearComponent } diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 63f6278d1ca..f7da8956d1c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -16,7 +16,7 @@ class XconfigRenormComponent(XconfigLayerBase): """This class is for parsing lines like - 'renorm-component name=renorm input=Append(-3,0,3)' + 'renorm-component name=renorm1 input=Append(-3,0,3)' which will produce just a single component, of type NormalizeComponent. 
Parameters of the class, and their defaults: @@ -70,9 +70,65 @@ def _generate_config(self): return configs +class XconfigBatchnormComponent(XconfigLayerBase): + """This class is for parsing lines like + 'batchnorm-component name=batchnorm input=Append(-3,0,3)' + which will produce just a single component, of type BatchNormComponent. + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + target-rms=1.0 [The target RMS of the BatchNormComponent] + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'target-rms': 1.0 } + + def check_configs(self): + assert self.config['target-rms'] > 0.0 + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + target_rms = self.config['target-rms'] + + configs = [] + line = ('component name={0} type=BatchNormComponent dim={1} target-rms={2}'.format( + self.name, input_dim, target_rms)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs + + class XconfigNoOpComponent(XconfigLayerBase): """This class is for parsing lines like - 'no-op-component name=renorm input=Append(-3,0,3)' + 'no-op-component name=noop1 input=Append(-3,0,3)' which will produce just a single component, of type NoOpComponent. Parameters of the class, and their defaults: @@ -127,7 +183,7 @@ class XconfigLinearComponent(XconfigLayerBase): """This class is for parsing lines like 'linear-component name=linear1 dim=1024 input=Append(-3,0,3)' which will produce just a single component, of type LinearComponent, with - output-dim 1024 in this case, and input-dim determined by the dimention + output-dim 1024 in this case, and input-dim determined by the dimension of the input . Parameters of the class, and their defaults: @@ -137,7 +193,7 @@ class XconfigLinearComponent(XconfigLayerBase): The following (shown with their effective defaults) are just passed through to the component's config line. 
- orthonormal-constraint=-1 + orthonormal-constraint=0.0 max-change=0.75 l2-regularize=0.0 diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py index 20d9c73eaf0..99f622d79a7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py @@ -210,7 +210,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -219,10 +221,9 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.combine_gpu_opt = "" - + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) else: logger.warning("Without using a GPU this will be very slow. " diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 613b70fd192..6a68d9ecb6e 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -228,7 +228,9 @@ def process_args(args): args.transform_dir = args.lat_dir # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -237,9 +239,9 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.combine_gpu_opt = "" + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) else: logger.warning("Without using a GPU this will be very slow. " diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 2cb314cca61..dd1c97b350d 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -118,7 +118,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -127,11 +129,12 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_gpu_opt = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.prior_queue_opt = "--gpu 1" + else: logger.warning("Without using a GPU this will be very slow. 
" "nnet3 does not yet support multiple threads.") diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 14922247cd3..0e787b0b647 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -126,7 +126,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -135,10 +137,10 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_gpu_opt = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.prior_queue_opt = "--gpu 1" else: diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 4623756caba..bd94fb7cb94 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -179,7 +179,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -188,10 +190,10 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_gpu_opt = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.prior_queue_opt = "--gpu 1" else: diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index fd74e5c9f44..83a1da8eca1 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -173,7 +173,9 @@ def process_args(args): # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() - if args.use_gpu: + if args.use_gpu in ["true", "false"]: + args.use_gpu = ("yes" if args.use_gpu == "true" else "no") + if args.use_gpu in ["yes", "wait"]: if not common_lib.check_if_cuda_compiled(): logger.warning( """You are running with one thread but you have not compiled @@ -182,10 +184,10 @@ def process_args(args): ./configure; make""") run_opts.train_queue_opt = "--gpu 1" - run_opts.parallel_train_opts = "" - run_opts.combine_gpu_opt = "" + run_opts.parallel_train_opts = "--use-gpu={}".format(args.use_gpu) + run_opts.combine_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.combine_queue_opt = "--gpu 1" - run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_gpu_opt = "--use-gpu={}".format(args.use_gpu) run_opts.prior_queue_opt = "--gpu 1" else: diff --git a/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py 
b/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py index f8d5008c3e9..8c53e5e8db9 100755 --- a/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py +++ b/egs/wsj/s5/steps/segmentation/internal/merge_segment_targets_to_recording.py @@ -165,6 +165,8 @@ def run(args): axis=0) utts.sort(key=lambda x: segments[x][1]) # sort on start time + end_frame_accounted = 0 + for i, utt in enumerate(utts): if utt not in segments or utt not in targets: num_utt_err += 1 @@ -208,45 +210,58 @@ def run(args): num_utt_err += 1 continue + # Fix end_frame and num_frames if the segment goes beyond + # the length of the recording. if end_frame > reco2num_frames[reco]: end_frame = reco2num_frames[reco] num_frames = end_frame - start_frame - if num_frames < 0: + # Fix "num_frames" and "end_frame" if "num_frames" is lower + # than the size of the targets matrix "mat" + num_frames = min(num_frames, mat.shape[0]) + end_frame = start_frame + num_frames + + if num_frames <= 0: logger.warning("For utterance {utt}, start-frame {start} " "is outside the recording" "".format(utt=utt, start=start_frame)) num_utt_err += 1 continue - prev_utt_end_frame = ( - int(segments[utts[i-1]][2] / args.frame_shift + 0.5) - if i > 0 else 0) - if start_frame < prev_utt_end_frame: - # Segment overlaps with the previous utterance + if end_frame < end_frame_accounted: + logger.warning("For utterance {utt}, end-frame {end} " + "is before the end of a previous segment. " + "i.e. this segment is completely within " + "another segment. Ignoring this segment." + "".format(utt=utt, end=end_frame)) + num_utt_err +=1 + continue + + if start_frame < end_frame_accounted: + # Segment overlaps with a previous utterance # Combine targets using a weighted interpolation using a # triangular window with a weight of 1 at the start/end of # overlap and 0 at the end/start of the segment - for n in range(0, prev_utt_end_frame - start_frame): - w = float(n) / float(prev_utt_end_frame - start_frame) + for n in range(0, end_frame_accounted - start_frame): + w = float(n) / float(end_frame_accounted - start_frame) reco_mat[n + start_frame, :] = ( reco_mat[n + start_frame, :] * (1.0 - w) + mat[n, :] * w) - num_frames = min(num_frames, mat.shape[0]) - end_frame = start_frame + num_frames - reco_mat[prev_utt_end_frame:end_frame, :] = ( - mat[(prev_utt_end_frame-start_frame): - (end_frame-start_frame), :]) + if end_frame > end_frame_accounted: + reco_mat[end_frame_accounted:end_frame, :] = ( + mat[(end_frame_accounted-start_frame): + (end_frame-start_frame), :]) else: # No overlap with the previous utterances. # So just add it to the output. - num_frames = min(num_frames, mat.shape[0]) - reco_mat[start_frame:(start_frame + num_frames), :] = ( + reco_mat[start_frame:end_frame, :] = ( mat[0:num_frames, :]) logger.debug("reco_mat shape = %s, mat shape = %s, " "start_frame = %d, end_frame = %d", reco_mat.shape, mat.shape, start_frame, end_frame) + + end_frame_accounted = end_frame num_utt += 1 if reco_mat.shape[0] > 0: diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index f8557a70177..20bcfd96d96 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -66,8 +66,10 @@ if [ $# -ne 6 ]; then Usage: $0 e.g.: $0 data/lang data/train data/train_whole exp/tri5 exp/tri4 exp/segmentation_1a - Note: Both and must have the recording-id - as speaker, and must contain feats.scp. 
+ Note: <whole-data-dir> is expected to have feats.scp and <data-dir> is + expected to have segments file. We will get the features for <data-dir> by + using row ranges of <whole-data-dir>/feats.scp. This script will + work on a copy of <data-dir> created to have the recording-id as the speaker-id. EOF exit 1 fi @@ -97,8 +99,7 @@ else extra_files="$extra_files $graph_dir/HCLG.fst $graph_dir/phones.txt" fi -for f in $in_data_dir/feats.scp $in_whole_data_dir/feats.scp \ - $in_data_dir/segments \ +for f in $in_whole_data_dir/feats.scp $in_data_dir/segments \ $lang/phones.txt $garbage_phones_list $silence_phones_list \ $ali_model_dir/final.mdl $model_dir/final.mdl $extra_files; do if [ ! -f $f ]; then @@ -125,8 +126,7 @@ if [ $stage -le 0 ]; then utils/data/modify_speaker_info_to_recording.sh \ $in_data_dir $dir/$data_id || exit 1 - steps/compute_cmvn_stats.sh $dir/$data_id || exit 1 - utils/validate_data_dir.sh $dir/$data_id || exit 1 + utils/validate_data_dir.sh --no-feats $dir/$data_id || exit 1 fi # Work with a temporary data directory with recording-id as the speaker labels. @@ -135,6 +135,13 @@ data_dir=$dir/${data_id} ############################################################################### # Get feats for the manual segments ############################################################################### +if [ $stage -le 1 ]; then + utils/data/subsegment_data_dir.sh $in_whole_data_dir ${data_dir}/segments ${data_dir}/tmp + cp $data_dir/tmp/feats.scp $data_dir + + steps/compute_cmvn_stats.sh $data_dir || exit 1 +fi + if [ $stage -le 2 ]; then utils/copy_data_dir.sh $in_whole_data_dir $dir/$whole_data_id diff --git a/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh b/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh index 5db6be731ce..dd315cc405b 100755 --- a/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh +++ b/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh @@ -46,7 +46,7 @@ utils/data/internal/combine_segments_to_recording.py \ --write-reco2utt=$dir/reco2sorted_utts $data/segments $dir/utt2spk || exit 1 if [ -f $data/text ]; then - utils/apply_map.pl -f 2 $data/text < $dir/reco2sorted_utts > $dir/text || exit 1 + utils/apply_map.pl -f 2- $data/text < $dir/reco2sorted_utts > $dir/text || exit 1 fi rm $dir/reco2sorted_utts diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 47670a2065a..fa5ff7856b0 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -85,7 +85,7 @@ if [ $# -ne 4 ]; then echo " --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I" echo " # markers on phones to indicate word-internal positions. " echo " --share-silence-phones (true|false) # default: false; if true, share pdfs of " - echo " # all non-silence phones. " + echo " # all silence phones. " echo " --sil-prob # default: 0.5 [must have 0 <= silprob < 1]" echo " --phone-symbol-table # default: \"\"; if not empty, use the provided " echo " # phones.txt as phone symbol table. This is useful " @@ -115,7 +115,7 @@ silprob=false echo "*Error validating directory $srcdir*" && exit 1; if [[ ! -f $srcdir/lexicon.txt ]]; then - echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt" + echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt" perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1; fi if [[ !
-f $srcdir/lexiconp.txt ]]; then diff --git a/src/configure b/src/configure index df7f0a96dd2..277bb340781 100755 --- a/src/configure +++ b/src/configure @@ -37,7 +37,7 @@ # # addition of the the --android-includes flag because the toolchains # # produced by the Android NDK don't always include the C++ stdlib # # headers in the normal cross compile include path. -# --host=aarch64-linux-android +# --host=aarch64-linux-android # # support for 64bit ARMv8(AArch64) architecture in Android. # This should be incremented after any significant change to the configure @@ -426,10 +426,10 @@ function configure_cuda { fi case $CUDA_VERSION in - 5_5) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; - 6_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; - 7_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; - 8_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;; + 5_5) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; + 6_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; + 7_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; + 8_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;; 9_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70" ;; *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; esac diff --git a/src/feat/wave-reader.cc b/src/feat/wave-reader.cc index 15b53d93d7d..f8259a3a82e 100644 --- a/src/feat/wave-reader.cc +++ b/src/feat/wave-reader.cc @@ -132,7 +132,23 @@ void WaveInfo::Read(std::istream &is) { uint32 riff_chunk_read = 0; riff_chunk_read += 4; // WAVE included in riff_chunk_size. - reader.Expect4ByteTag("fmt "); + // Possibly skip any RIFF tags between 'WAVE' and 'fmt '. + // Apple devices produce a filler tag 'JUNK' for memory alignment. 
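+ // For illustration (an assumed, typical layout): such a file might begin
+ //   "RIFF" <riff-size> "WAVE" "JUNK" <junk-size> <junk-size filler bytes> "fmt " <fmt-size> ...
+ // Each such chunk is a 4-byte tag followed by a 4-byte little-endian size and
+ // that many payload bytes, so any tag other than "fmt " can be read and then
+ // skipped generically, which is what the loop below does.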
+ reader.Read4ByteTag(); + riff_chunk_read += 4; + while (strcmp(reader.tag,"fmt ") != 0) { + uint32 filler_size = reader.ReadUint32(); + riff_chunk_read += 4; + for (uint32 i = 0; i < filler_size; i++) { + is.get(); // read 1 byte, + } + riff_chunk_read += filler_size; + // get next RIFF tag, + reader.Read4ByteTag(); + riff_chunk_read += 4; + } + + KALDI_ASSERT(strcmp(reader.tag,"fmt ") == 0); uint32 subchunk1_size = reader.ReadUint32(); uint16 audio_format = reader.ReadUint16(); num_channels_ = reader.ReadUint16(); @@ -190,9 +206,8 @@ void WaveInfo::Read(std::istream &is) { KALDI_ERR << "Unexpected block_align: " << block_align << " vs. " << num_channels_ << " * " << (bits_per_sample/8); - riff_chunk_read += 8 + subchunk1_size; - // size of what we just read, 4 bytes for "fmt " + 4 - // for subchunk1_size + subchunk1_size itself. + riff_chunk_read += 4 + subchunk1_size; + // size of what we just read, 4 for subchunk1_size + subchunk1_size itself. // We support an optional "fact" chunk (which is useless but which // we encountered), and then a single "data" chunk. @@ -217,10 +232,7 @@ void WaveInfo::Read(std::istream &is) { riff_chunk_read += 4; } - if (strcmp(reader.tag, "data")) - KALDI_ERR << "WaveData: expected data chunk, got instead " - << reader.tag; - + KALDI_ASSERT(strcmp(reader.tag, "data") == 0); uint32 data_chunk_size = reader.ReadUint32(); riff_chunk_read += 4; diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index 9a4559803ad..77d78113bbb 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -322,10 +322,23 @@ void Compiler::CreateStepInfo( stride_type); } else { // kDimRange. Will just be a sub-matrix of a Component or Input node. - int32 cindex_id = this_info.output_cindex_ids.front(), - input_cindex_id = graph_.dependencies[cindex_id][0], - input_step = cindex_id_to_location_[input_cindex_id].first; - KALDI_ASSERT(input_step != -1 && input_step < step); + std::vector::const_iterator + iter = this_info.output_cindex_ids.begin(), + end = this_info.output_cindex_ids.end(); + int32 source_cindex_id = -1; + for (; iter != end; ++iter) { + int32 cindex_id = *iter; + if (!graph_.dependencies[cindex_id].empty()) { + KALDI_ASSERT(graph_.dependencies[cindex_id].size() == 1); + source_cindex_id = graph_.dependencies[cindex_id][0]; + break; + } + } + KALDI_ASSERT(source_cindex_id >= 0); + int32 input_step = cindex_id_to_location_[source_cindex_id].first; + KALDI_ASSERT(this_info.output_cindex_ids.size() == + steps_[input_step].output_cindex_ids.size()); + KALDI_ASSERT(input_step >= 0 && input_step < step); KALDI_PARANOID_ASSERT(this_info.output_indexes == steps_[input_step].output_indexes); this_info.value = computation->NewSubMatrix(steps_[input_step].value, @@ -376,6 +389,8 @@ void Compiler::CreateStepInfo( KALDI_ASSERT(cur_dim_offset == desc.Dim(nnet_)); } } + KALDI_ASSERT(static_cast(this_info.output_cindex_ids.size()) == + computation->submatrices[this_info.value].num_rows); } } diff --git a/src/nnet3/nnet-compile.h b/src/nnet3/nnet-compile.h index 955ac47cbe1..21918c0e539 100644 --- a/src/nnet3/nnet-compile.h +++ b/src/nnet3/nnet-compile.h @@ -134,7 +134,9 @@ class Compiler { std::vector *deriv_needed); // this sets up steps_, destroying the input "by_step" in the process. It - // also sets various matrix and sub-matrix sizes in "computation". + // also sets various matrix and sub-matrix sizes in "computation". 
The input + // 'by_step' is elsewhere referred to as just 'step'; it is a vector of steps, + // and each step is a vector of cindex_ids that are computed by that step. void CreateStepInfo(const std::vector &deriv_needed, const std::vector &step_to_segment, std::vector > *by_step, diff --git a/src/nnet3/nnet-computation-graph.cc b/src/nnet3/nnet-computation-graph.cc index bded9e84b2f..9c84115d406 100644 --- a/src/nnet3/nnet-computation-graph.cc +++ b/src/nnet3/nnet-computation-graph.cc @@ -1643,7 +1643,7 @@ int32 ComputationStepsComputer::AddStep(const std::vector &cindexes, *out_iter = cindex_id; if (added) { KALDI_ASSERT(cindex_id == static_cast(locations_->size())); - locations_->resize(cindex_id + 1); + locations_->resize(cindex_id + 1, std::pair(-1, -1)); locations_->back().first = step_index; locations_->back().second = row_index; locations = &((*locations_)[0]); // in case it was reallocated @@ -1867,68 +1867,52 @@ void ComputationStepsComputer::ProcessDimRangeSubPhase( ConvertToCindexIds(input_cindexes, &input_cindex_ids); std::vector > locations; ConvertToLocations(input_cindex_ids, &locations); - std::sort(locations.begin(), locations.end()); + + // get a list of the source step indexes (corresponding to computations for the + // source component-node) + std::unordered_set source_step_indexes; KALDI_ASSERT(!locations.empty()); std::vector >::const_iterator locations_iter = locations.begin(), locations_end = locations.end(); - // Each unique .first number in locations (i.e. each source step, and they - // will all correspond to component-output or input steps) will generate one - // 'step' of type kDimRange. Because dim-range nodes must be contiguous - // ranges of a source step (since they are represented as sub-matrices), for - // each source step we work out the first and last row-index (i.e. first and - // last .second member of locations) and use that to reconstruct the range. - - // each element of 'steps' will be (source_step, (begin_row, end_row)) so that - // the source of the dim-range node is indexes begin_row ... end_row-1 in that - // source step. - std::vector > > steps; - - int32 cur_source_step = locations_iter->first, - cur_row_begin = locations_iter->second, - cur_row_end = cur_row_begin + 1; - while (1) { - ++locations_iter; - if (locations_iter == locations_end || - locations_iter->first != cur_source_step) { - // we reached the end of a run of the same step. - std::pair > this_step; - this_step.first = cur_source_step; - this_step.second.first = cur_row_begin; - this_step.second.second = cur_row_end; - steps.push_back(this_step); - if (locations_iter != locations_end) { - cur_source_step = locations_iter->first; - cur_row_begin = locations_iter->second; - cur_row_end = cur_row_begin + 1; - } else { - break; - } - } else { - cur_row_end = locations_iter->second + 1; + + // 'cur_source_step_index' is just an optimization to prevent unnecessary + // unordered_set inserts. + int32 cur_source_step_index = -1; + for (; locations_iter != locations_end; ++locations_iter) { + int32 source_step_index = locations_iter->first; + if (source_step_index != cur_source_step_index) { + cur_source_step_index = source_step_index; + source_step_indexes.insert(cur_source_step_index); } } - for (size_t i = 0; i < steps.size(); i++) { - // iterating over different source steps, although normally - // there will be just one. 
- int32 source_step = steps[i].first, - row_begin = steps[i].second.first, - row_end = steps[i].second.second; - // 'source' is just the elements of the source step that we're consuming. - std::vector source((*steps_)[source_step].begin() + row_begin, - (*steps_)[source_step].begin() + row_end); + std::unordered_set::const_iterator + source_step_iter = source_step_indexes.begin(), + source_step_end = source_step_indexes.end(); + // iterating over the indexes of the source steps. + for (; source_step_iter != source_step_end; ++source_step_iter) { + int32 source_step_index = *source_step_iter; + std::pair p(source_step_index, dim_range_node); + if (dim_range_nodes_.count(p) > 0) { + // We don't need to do anything; a dim-range node already exists for this + // step and this node index. + continue; + } + dim_range_nodes_.insert(p); + const std::vector &source_step = (*steps_)[source_step_index]; + // 'cindexes' will be the cindexes of the new step that we're going to add. std::vector cindexes; - ConvertToCindexes(source, &cindexes); + ConvertToCindexes(source_step, &cindexes); std::vector::iterator iter = cindexes.begin(), end = cindexes.end(); for (; iter != end; ++iter) iter->first = dim_range_node; bool add_if_absent = true; // this add_if_absent says, even if cindexes were not in the graph, - // add them. This is possible in principle; it's to satisfy the - // requirement that DimRangeNodes be implemented as contiguous ranges - // of rows of component nodes or input nodes. + // add them. This is possible; the step will contain all cindexes for the + // input step, even if they won't be needed. (This is costless; it's just + // setting up a sub-matrix). AddStep(cindexes, add_if_absent); } } diff --git a/src/nnet3/nnet-computation-graph.h b/src/nnet3/nnet-computation-graph.h index 7999f2208ad..c0662756502 100644 --- a/src/nnet3/nnet-computation-graph.h +++ b/src/nnet3/nnet-computation-graph.h @@ -439,7 +439,7 @@ class ComputationStepsComputer { /// (step-index, index-into-step), so that for any cindex_id c, /// (*steps)[locations[c].first][locations[c].second] == c. /// It's possible in principle if there are non-simple - /// Components, that for node corresponding to component-input + /// Components, that for nodes corresponding to component-input /// descriptors, a cindex might be present in more than one step, /// so it doesn't follow that if (*steps)[i][j] == c, then /// locations[c] == (i,j). @@ -547,6 +547,15 @@ class ComputationStepsComputer { /// (*steps_)[i][j] == c. This is also an output (we get the pointer in /// the constructor). std::vector > *locations_; + + + /// dim_range_nodes_ is used when allocating steps for nodes of type kDimRangeNode. + /// This is a set of (source_step, dim_range_node_index), + /// where source_step is the step in which we computed of the input + /// of the dim-range node (this step will be for a node of type kComponentNode). + /// This just tells us whether we've already added a particular dim-range node + /// for this step, so we know whether we need to add it again. + std::unordered_set, PairHasher > dim_range_nodes_; }; diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index d056a71498c..aefcb94c465 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -395,12 +395,17 @@ struct NnetComputation { // These are owned here. std::vector component_precomputed_indexes; - // used in kAddRows, kAddToRows, kCopyRows, kCopyToRows. contains row-indexes. 
+ // Used in commands kAddRows, kAddToRows, kCopyRows, which + // contain indexes into this data-member. + // Each vector<int32> is a vector of row-indexes (with -1 usually treated as + // a special case meaning "don't do anything for this row" for add + // commands, or "use zero" for copy commands). std::vector<std::vector<int32> > indexes; - // used kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti. - // contains pairs (sub-matrix index, row index)- or (-1,-1) meaning don't - // do anything for this row. + // Used in commands kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti and + // kCopyToRowsMulti. Contains pairs (sub-matrix index, row index), or the + // special pair (-1,-1) meaning "don't do anything for this row" for add + // commands, or "use zero" for copy commands. std::vector<std::vector<std::pair<int32, int32> > > indexes_multi; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index c53fba815fb..756ea45e894 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -2576,6 +2576,328 @@ bool SnipRowOps(NnetComputation *computation) { +// This class implements the internals of the function SplitRowOps() which is +// declared in nnet-optimize-utils.h. +class RowOpsSplitter { + public: + RowOpsSplitter(NnetComputation *computation): computation_(computation) { } + + // Attempts to perform the optimization. Returns true if it made any change + // to the computation. + bool Split() { + return SplitIndexes() && SplitCommands(); + } + + private: + + // This function sets up split_info_, which describes how we can split up + // the vectors that are elements of computation_->indexes_multi. + // It will return true if it successfully split at least one of those + // vectors, and false otherwise. + bool SplitIndexes(); + + // This function modifies the commands in the computation. It returns + // true if it made any change. + bool SplitCommands(); + + + // This function attempts to optimize the command in + // computation_->commands[command_index]. It returns true if it made any + // change. If we are going to have to insert an extra command into the + // computation, this function will append an element to new_commands_. + bool SplitCommand(int32 command_index); + + // Below, define a multi-index as an element of NnetComputation::indexes_multi, + // for example, + // const std::vector<std::pair<int32, int32> > &multi_index = computation_->indexes_multi[1]; + // It is a list of pairs. + + // This struct appears as an element of the list inside MultiIndexSplitInfo. + // It helps us describe how we can split up a multi-index (a list of pairs) + // into a sequence of ranges where the .first value is constant across the + // range. + struct SingleSplitInfo { + // 'offset' is the index into the vector of pairs that forms the + // start of this range. In the example where we are splitting up + // ((10,2), (10,3), (10,4), (15,3), (15,5), (15,7)) + // there would be two instances of struct SingleSplitInfo, with + // offset = 0 and offset = 3. + int32 offset; + // 'size' is the number of pairs in this range; in the example + // above, both 'size' elements would be 3. + int32 size; + // first_value is the value of the .first index throughout this range; in + // the example above, it would be 10 and 15 respectively. It represents a + // submatrix index. + int32 first_value; + + // min_second_value is the minimum value of .second for any element in + // this range: it would be 2 and 3 respectively in the example above.
+ int32 min_second_value; + + // second_value_range is the highest value of .second for any element in + // this range, plus one, minus min_second_value. (It's the number of rows + // in the other submatrix of the operation). + int32 second_value_range; + + // If the .second values in the range are consecutive then + // 'second_value_offsets' will be empty. Otherwise it will + // be a vector of size 'size', containing numbers in the + // range 0 ... second_value_range - 1, such that + // min_second_value + second_value_offsets[i] gives + // the .second value at the corresponding position in the range. + // In the second range of the example above, the range + // consisting of ((15,3), (15,5), (15,7)), 'second_value_offsets + // would be the vector (0, 2, 4). + std::vector second_value_offsets; + }; + + // An instance of the struct MultiIndexSplitInfo will be created for each multi-index, + // i.e. for each element of NnetComputation::indexes_multi. + struct MultiIndexSplitInfo { + // If we can split this multi-index into at most two ranges, this + // vector will be nonempty; otherwise it will be empty. + std::vector splits; + }; + + // GetSplitInfo() attempts to take a range of a + // std::vector >, as represented by begin and end + // iterators, and to extract its information into an object of type + // SingleSplitInfo. (all except for the .offset member, which will have + // been set by calling code). + // It return true if successful, and false otherwise. The only reasons that + // it might return false are that the range contains -1's or does not contain + // all-identical .first members). + bool GetSplitInfo(std::vector >::const_iterator begin, + std::vector >::const_iterator end, + SingleSplitInfo *info); + + // computation_ is the computation that we are modifying. + NnetComputation *computation_; + // split_info_ will contain information about how we can split up the members + // of computation_->indexes_multi into ranges. + std::vector split_info_; + // The following is a list of additional commands that we are going to insert + // into computation_, of the form (command-index, command) where command-index + // is a command index just before which we will insert the new command. + // (this is the format accepted by the function InsertCommands()). + std::vector > new_commands_; + +}; + + +bool RowOpsSplitter::GetSplitInfo( + std::vector >::const_iterator begin, + std::vector >::const_iterator end, + SingleSplitInfo *info) { + // max_size_ratio must be > 1.0, and could in principle be a float. It is + // there to prevent us from making changes to the computation which would end + // up wastefully launching too many kernels that would do nothing. 
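+ // As a worked illustration of the struct above (using its own example): for the
+ // range ((15,3), (15,5), (15,7)) we would get size = 3, first_value = 15,
+ // min_second_value = 3 and second_value_range = 7 + 1 - 3 = 5, which passes the
+ // check below since 5 <= 3 * 2; its second_value_offsets would be (0, 2, 4).
+ // For ((10,2), (10,3), (10,4)) the .second values are consecutive, so
+ // second_value_offsets would be left empty.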
+ const int32 max_size_ratio = 2; + + int32 size = end - begin; + KALDI_ASSERT(size != 0); + int32 first = begin->first; + if (first < 0) + return false; + info->size = size; + info->first_value = first; + int32 initial_second_value = begin->second, + min_second_value = initial_second_value, + max_second_value = initial_second_value; + info->second_value_offsets.resize(size); + bool is_consecutive = true; + for (int32 i = 0; i < size; i++) { + int32 second = begin[i].second; + if (begin[i].first != first || second < 0) return false; + info->second_value_offsets[i] = second; + if (second != initial_second_value + i) + is_consecutive = false; + if (second < min_second_value) min_second_value = second; + if (second > max_second_value) max_second_value = second; + } + info->min_second_value = min_second_value; + info->second_value_range = max_second_value + 1 - min_second_value; + if (info->second_value_range > size * max_size_ratio) + return false; + if (is_consecutive) { + info->second_value_offsets.clear(); + } else { + for (int32 i = 0; i < size; i++) + info->second_value_offsets[i] -= min_second_value; + } + return true; +} + + +bool RowOpsSplitter::SplitIndexes() { + bool ans = false; + int32 num_indexes_multi = computation_->indexes_multi.size(); + split_info_.resize(num_indexes_multi); + for (int32 i = 0; i < num_indexes_multi; i++) { + const std::vector > &multi_index = + computation_->indexes_multi[i]; + MultiIndexSplitInfo &split_info = split_info_[i]; + + int32 num_pairs = multi_index.size(); + KALDI_ASSERT(num_pairs > 0); + // 'split_point' will be set to the first index j for which + // multi_index[j-1].first != multi_index[j].first, or -1 + // if no such j exists. + int32 split_point = -1, initial_first = multi_index[0].first; + for (int32 j = 1; j < num_pairs; j++) { + if (multi_index[j].first != initial_first) { + split_point = j; + break; + } + } + if (split_point == -1) { + split_info.splits.resize(1); + split_info.splits[0].offset = 0; + if (!GetSplitInfo(multi_index.begin(), multi_index.end(), + &(split_info.splits[0]))) { + split_info.splits.clear(); + } else { + ans = true; + } + } else { + split_info.splits.resize(2); + split_info.splits[0].offset = 0; + split_info.splits[1].offset = split_point; + + std::vector >::const_iterator mid_iter = + multi_index.begin() + split_point; + if (!GetSplitInfo(multi_index.begin(), mid_iter, + &(split_info.splits[0])) || + !GetSplitInfo(mid_iter, multi_index.end(), + &(split_info.splits[1]))) { + split_info.splits.clear(); + } else { + ans = true; + } + } + } + return ans; +} + +bool RowOpsSplitter::SplitCommand(int32 c) { + NnetComputation::Command &command = computation_->commands[c]; + CommandType command_type = command.command_type; + // For commands that are not of the following four types, return false: we + // won't be changing these commands. + switch (command_type) { + case kAddRowsMulti: case kCopyRowsMulti: + case kAddToRowsMulti: case kCopyToRowsMulti: break; + default: return false; + } + int32 indexes_multi_index = command.arg2; + KALDI_ASSERT(indexes_multi_index < + static_cast(split_info_.size())); + const MultiIndexSplitInfo &split_info = split_info_[indexes_multi_index]; + if (split_info.splits.empty()) + return false; // these indexes couldn't be split: e.g. they contained more + // than two distinct .first elements, or there were other + // reasons. + + // we'll be splitting the command into either one or two pieces. 
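+ // As a sketch of the intended effect (reusing the example multi-index from the
+ // SingleSplitInfo comments): a kAddRowsMulti command whose multi-index is
+ // ((10,2), (10,3), (10,4), (15,3), (15,5), (15,7)) would become a kMatrixAdd
+ // reading the 3 consecutive rows starting at row 2 of submatrix 10, plus a
+ // kAddRows reading from a 5-row sub-matrix starting at row 3 of submatrix 15
+ // with the row map (0, 2, 4).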
+ std::vector<NnetComputation::Command> split_commands( + split_info.splits.size()); + for (size_t i = 0; i < split_info.splits.size(); i++) { + const SingleSplitInfo &split = split_info.splits[i]; + NnetComputation::Command &command_out = split_commands[i]; + command_out.alpha = command.alpha; + command_out.arg1 = computation_->NewSubMatrix( + command.arg1, split.offset, split.size, 0, -1); + command_out.arg2 = computation_->NewSubMatrix( + split.first_value, split.min_second_value, + split.second_value_range, 0, -1); + + if (split.second_value_offsets.empty()) { + // The .second elements are consecutive. + switch (command_type) { + case kAddRowsMulti: + command_out.command_type = kMatrixAdd; + break; + case kCopyRowsMulti: + command_out.command_type = kMatrixCopy; + break; + case kAddToRowsMulti: + command_out.command_type = kMatrixAdd; + std::swap(command_out.arg1, command_out.arg2); + break; + case kCopyToRowsMulti: + command_out.command_type = kMatrixCopy; + std::swap(command_out.arg1, command_out.arg2); + break; + default: // will never be reached. + break; + } + } else { + // Indexes are not consecutive: it needs to be a kAddRows or kCopyRows + // command. + command_out.arg3 = computation_->indexes.size(); + switch (command_type) { + case kAddRowsMulti: case kCopyRowsMulti: { + command_out.command_type = (command_type == kAddRowsMulti ? + kAddRows : kCopyRows); + computation_->indexes.push_back(split.second_value_offsets); + break; + } + case kCopyToRowsMulti: { + // We can't operate on this command because of what would happen + // with values of 'indexes' (see the variable in the block for + // kAddToRowsMulti) which were -1. Rows of the output would be + // set to zero, which is not the behavior we want here; we'd want + // them to be unaffected. + return false; + } + case kAddToRowsMulti: { + command_out.command_type = kAddRows; + std::swap(command_out.arg1, command_out.arg2); + // invert the indexes. + std::vector<int32> indexes(split.second_value_range, -1); + for (int32 i = 0; i < split.size; i++) { + // the following assert should always succeed because the + // AddToRowsMulti and CopyToRowsMulti should never have + // duplicate destinations in their indexes. + KALDI_ASSERT(indexes[split.second_value_offsets[i]] == -1); + indexes[split.second_value_offsets[i]] = i; + } + computation_->indexes.push_back(indexes); + break; + } + default: + KALDI_ERR << "Code error: un-handled case."; + } + } + } + command = split_commands[0]; + // note: for now, split_commands.size() will be 1 or 2. + for (size_t i = 1; i < split_commands.size(); i++) { + new_commands_.resize(new_commands_.size() + 1); + // we'll want to insert this command right after command c, + // which is the same as just before command c + 1. + new_commands_.back().first = c + 1; + new_commands_.back().second = split_commands[i]; + } + return true; // We made a change.
+} + +bool RowOpsSplitter::SplitCommands() { + bool ans = false; + int32 num_commands = computation_->commands.size(); + for (int32 c = 0; c < num_commands; c++) + if (SplitCommand(c)) + ans = true; + if (!new_commands_.empty()) + InsertCommands(&new_commands_, computation_); + return ans; +} + +bool SplitRowOps(NnetComputation *computation) { + RowOpsSplitter splitter(computation); + return splitter.Split(); +} /* diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 703f43af095..32adf9e3e19 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -455,6 +455,23 @@ bool ReplaceRowWithMatrixOps(NnetComputation *computation); /// computation->indexes. bool SnipRowOps(NnetComputation *computation); + +/// This function detects cases where commands of type kAddRowsMulti, +/// kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti use indexes that +/// correspond to at most two submatrices, in two distinct ranges without gaps +/// filled by -1's, and could be converted to at most two commands of type +/// kMatrixAdd, kMatrixCopy, kAddRows or kCopyRows. (Note: it's important that +/// this optimization takes place after SnipRowOps, because it doesn't remove +/// the -1's from the edges of the indexes, it relies on that operation doing +/// so). The "without-gaps" stipulation is just for convenience of +/// implementation, to have fewer cases to worry about. +/// +/// This function returns true if it made any changes to the computation; if it +/// returns true, then after calling this you should at some point do +/// RenumberComputation(), which will remove any now-unused members of +/// computation->indexes. +bool SplitRowOps(NnetComputation *computation); + /// This function detects submatrices and matrices that are never used (e.g. due /// to changes made in other optimization code), and members of indexes, /// indexes_multi and indexes_ranges that are unused or are duplicates, and memo @@ -535,18 +552,18 @@ void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); /// Inserts commands into the computation at the requested places. 'commands' -/// is a list of pairs (command-index, command) that is expected to be sorted -/// on command-index. For each entry (c, command) in 'commands', 'command' is -/// inserted into 'computation' just *before* the command that (at entry) is in -/// computation->commands[c]. If there are multiple pairs with the same index -/// c, they will remain in the same order in which they were present in -/// 'commands'; however, 'commands' does not have to be sorted on 'c'. -/// As a special case, if c == computation->commands.size(), the -/// corresponding commands are inserted at the beginning of the computation. -/// This function will appropriately renumber the argument of the kGotoLabel -/// command of any 'looped' computation. Command indexes c in commands[*].first -/// must be in the range [0, computation->commands.size()]. -/// This function may modify 'commands' by sorting it. +/// is a list of pairs (command-index, command) that is expected to be sorted on +/// command-index. For each entry (c, command) in 'commands', 'command' is +/// inserted into 'computation' just *before* the command that (at entry) is in +/// computation->commands[c]. If there are multiple pairs with the same index +/// c, they will remain in the same order in which they were present in +/// 'commands'; however, 'commands' does not have to be sorted on 'c'. 
As a +/// special case, if c == computation->commands.size(), the corresponding +/// commands are inserted at the beginning of the computation. This function +/// will appropriately renumber the argument of the kGotoLabel command of any +/// 'looped' computation. Command indexes c in commands[*].first must be in the +/// range [0, computation->commands.size()]. This function may modify +/// 'commands' by sorting it. void InsertCommands( std::vector > *commands, NnetComputation *computation); diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index d614afce7d0..ecce196801b 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -41,6 +41,14 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { if (tok == "") { ReadBasicType(is, binary, &optimize_row_ops); ReadToken(is, binary, &tok); + } else { + optimize_row_ops = true; + } + if (tok == "") { + ReadBasicType(is, binary, &split_row_ops); + ReadToken(is, binary, &tok); + } else { + split_row_ops = true; } KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &convert_addition); @@ -516,12 +524,16 @@ void Optimize(const NnetOptimizeOptions &config, } - if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { + if (config.optimize && (config.snip_row_ops || config.optimize_row_ops || + config.split_row_ops)) { bool must_renumber = false; if (config.snip_row_ops && SnipRowOps(computation)) must_renumber = true; + if (config.split_row_ops && SplitRowOps(computation)) + must_renumber = true; if (config.optimize_row_ops && ReplaceRowWithMatrixOps(computation)) must_renumber = true; + if (must_renumber) { RenumberComputation(computation); if (GetVerboseLevel() >= 3) diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 31872e46b72..a07c5490c5c 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -39,6 +39,7 @@ struct NnetOptimizeOptions { bool propagate_in_place; bool backprop_in_place; bool optimize_row_ops; + bool split_row_ops; bool extend_matrices; bool convert_addition; bool remove_assignments; @@ -63,6 +64,7 @@ struct NnetOptimizeOptions { propagate_in_place(true), backprop_in_place(true), optimize_row_ops(true), + split_row_ops(true), extend_matrices(true), convert_addition(true), remove_assignments(true), @@ -95,6 +97,10 @@ struct NnetOptimizeOptions { opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to " "disable certain optimizations that act on operations of " "type *Row*."); + opts->Register("split-row-ops", &split_row_ops, "Set to false to disable " + "an optimization that may replace some operations of type " + "kCopyRowsMulti or kAddRowsMulti with up to two simpler " + "operations."); opts->Register("convert-addition", &convert_addition, "Set to false to " "disable the optimization that converts Add commands into " "Copy commands wherever possible."); diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 48a97df9ea1..e68321b3260 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -679,26 +679,36 @@ void GenerateConfigSequenceLstmWithTruncation( } std::string spliced_input = temp_string_stream.str(); - std::string c_tminus1 = "IfDefined(Offset(c_t, -1))"; + int32 offset = RandInt(-3, 3); + if (offset == 0) + offset = -1; + + + std::string c_tminus1; + { + std::ostringstream os_temp; + os_temp << "IfDefined(Offset(c_t, " << offset << "))"; + c_tminus1 = os_temp.str(); + } os << "component-node name=c_t component=c input=Sum(c1_t, c2_t)\n"; // i_t os << 
"component-node name=i1 component=Wi-xr input=Append(" - << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + << spliced_input << ", IfDefined(Offset(r_t, " << offset << ")))\n"; os << "component-node name=i2 component=Wic " << " input=" << c_tminus1 << std::endl; os << "component-node name=i_t component=i input=Sum(i1, i2)\n"; // f_t os << "component-node name=f1 component=Wf-xr input=Append(" - << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + << spliced_input << ", IfDefined(Offset(r_t, " << offset << ")))\n"; os << "component-node name=f2 component=Wfc " << " input=" << c_tminus1 << std::endl; os << "component-node name=f_t component=f input=Sum(f1, f2)\n"; // o_t os << "component-node name=o1 component=Wo-xr input=Append(" - << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + << spliced_input << ", IfDefined(Offset(r_t, " << offset << ")))\n"; os << "component-node name=o2 component=Woc input=Sum(c1_t, c2_t)\n"; os << "component-node name=o_t component=o input=Sum(o1, o2)\n"; @@ -707,7 +717,7 @@ void GenerateConfigSequenceLstmWithTruncation( // g_t os << "component-node name=g1 component=Wc-xr input=Append(" - << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + << spliced_input << ", IfDefined(Offset(r_t, " << offset << ")))\n"; os << "component-node name=g_t component=g input=g1\n"; // parts of c_t @@ -758,6 +768,10 @@ void GenerateConfigSequenceLstmType2( cell_dim = 40 + Rand() % 50, projection_dim = std::ceil(cell_dim / (Rand() % 10 + 2)); + int32 offset = RandInt(-3, 3); + if (offset == 0) + offset = -1; + os << "input-node name=input dim=" << input_dim << std::endl; // Parameter Definitions W*(* replaced by - to have valid names) os << "component name=W-x type=NaturalGradientAffineComponent input-dim=" @@ -819,10 +833,13 @@ void GenerateConfigSequenceLstmType2( } os << ")\n"; - os << "component-node name=W-r component=W-r input=IfDefined(Offset(r_t, -1))\n"; + os << "component-node name=W-r component=W-r input=IfDefined(Offset(r_t" + << offset << "))\n"; os << "component-node name=W-m component=W-m input=m_t \n"; - os << "component-node name=Wic component=Wic input=IfDefined(Offset(c_t, -1))\n"; - os << "component-node name=Wfc component=Wfc input=IfDefined(Offset(c_t, -1))\n"; + os << "component-node name=Wic component=Wic input=IfDefined(Offset(c_t" + << offset << "))\n"; + os << "component-node name=Wfc component=Wfc input=IfDefined(Offset(c_t" + << offset << "))\n"; os << "component-node name=Woc component=Woc input=c_t\n"; // Splitting the outputs of W*m node @@ -857,7 +874,8 @@ void GenerateConfigSequenceLstmType2( os << "component-node name=i_t component=i_t input=Sum(W_ix-x_t, Sum(W_ir-r_tminus1, Wic))\n"; os << "component-node name=f_t component=f_t input=Sum(W_fx-x_t, Sum(W_fr-r_tminus1, Wfc))\n"; os << "component-node name=o_t component=o_t input=Sum(W_ox-x_t, Sum(W_or-r_tminus1, Woc))\n"; - os << "component-node name=f_t-c_tminus1 component=f_t-c_tminus1 input=Append(f_t, Offset(c_t, -1))\n"; + os << "component-node name=f_t-c_tminus1 component=f_t-c_tminus1 input=Append(f_t, Offset(c_t" + << offset << "))\n"; os << "component-node name=i_t-g component=i_t-g input=Append(i_t, g)\n"; os << "component-node name=m_t component=m_t input=Append(o_t, h)\n"; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 2874d6fd14e..fa6ade55864 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -1867,19 +1867,7 @@ class ModelCollapser { void CollapseModel(const CollapseModelConfig &config, Nnet *nnet) { ModelCollapser c(config, 
nnet); - std::string info_before_collapse; - if (GetVerboseLevel() >= 4) - info_before_collapse = nnet->Info(); c.Collapse(); - if (GetVerboseLevel() >= 4) { - std::string info_after_collapse = nnet->Info(); - if (info_after_collapse != info_before_collapse) { - KALDI_VLOG(4) << "Collapsing model: info before collapse was: " - << info_before_collapse - << ", info after collapse was:" - << info_after_collapse; - } - } } bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc index d6b4b1ded5d..3cd56ef1c74 100644 --- a/src/nnet3bin/nnet3-compute.cc +++ b/src/nnet3bin/nnet3-compute.cc @@ -39,7 +39,7 @@ int main(int argc, char *argv[]) { "If --apply-exp=true, apply the Exp() function to the output " "before writing it out.\n" "\n" - "Usage: nnet3-compute [options] \n" + "Usage: nnet3-compute [options] \n" " e.g.: nnet3-compute final.raw scp:feats.scp ark:nnet_prediction.ark\n" "See also: nnet3-compute-from-egs\n"; @@ -49,7 +49,7 @@ int main(int argc, char *argv[]) { NnetSimpleComputationOptions opts; opts.acoustic_scale = 1.0; // by default do no scaling in this recipe. - bool apply_exp = false; + bool apply_exp = false, use_priors = false; std::string use_gpu = "yes"; std::string word_syms_filename; @@ -74,6 +74,9 @@ int main(int argc, char *argv[]) { "output"); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("use-priors", &use_priors, "If true, subtract the logs of the " + "priors stored with the model (in this case, " + "a .mdl file is expected as input)."); po.Read(argc, argv); @@ -90,12 +93,26 @@ int main(int argc, char *argv[]) { feature_rspecifier = po.GetArg(2), matrix_wspecifier = po.GetArg(3); - Nnet nnet; - ReadKaldiObject(nnet_rxfilename, &nnet); + Nnet raw_nnet; + AmNnetSimple am_nnet; + if (use_priors) { + bool binary; + TransitionModel trans_model; + Input ki(nnet_rxfilename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + } else { + ReadKaldiObject(nnet_rxfilename, &raw_nnet); + } + Nnet &nnet = (use_priors ? am_nnet.GetNnet() : raw_nnet); SetBatchnormTestMode(true, &nnet); SetDropoutTestMode(true, &nnet); CollapseModel(CollapseModelConfig(), &nnet); + Vector priors; + if (use_priors) + priors = am_nnet.Priors(); + RandomAccessBaseFloatMatrixReader online_ivector_reader( online_ivector_rspecifier); RandomAccessBaseFloatVectorReaderMapped ivector_reader( @@ -139,7 +156,6 @@ int main(int argc, char *argv[]) { } } - Vector priors; DecodableNnetSimple nnet_computer( opts, nnet, priors, features, &compiler, diff --git a/tools/extras/install_beamformit.sh b/tools/extras/install_beamformit.sh index db767682467..e61b6645c36 100755 --- a/tools/extras/install_beamformit.sh +++ b/tools/extras/install_beamformit.sh @@ -5,7 +5,7 @@ # libsndfile needed by beamformit [ ! -f libsndfile-1.0.25.tar.gz ] && \ wget http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.25.tar.gz -[ ! -d liblbfgs-1.10] && \ +[ ! -d libsndfile-1.0.25 ] && \ tar xzf libsndfile-1.0.25.tar.gz ( cd libsndfile-1.0.25