diff --git a/egs/material/README b/egs/material/README
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/egs/material/s5/README b/egs/material/s5/README
new file mode 100644
index 00000000000..0eb112493a4
--- /dev/null
+++ b/egs/material/s5/README
@@ -0,0 +1,35 @@
+About the MATERIAL corpus:
+
+The MATERIAL project:
+https://www.iarpa.gov/index.php/research-programs/material
+https://www.nist.gov/itl/iad/mig/openclir-evaluation
+
+The speech data in the MATERIAL corpus consist of four data sets for each
+language: train (BUILD), development (BUILD-dev), test (ANALYSIS1 and ANALYSIS2),
+and unlabeled evaluation audio (EVAL{1,2,3}). The train, development, test, and
+evaluation data contain around 40, 10, 20, and 250 hours of audio respectively.
+The train set is transcribed conversational audio that can be used for training
+an ASR system. It consists partly of 8-bit a-law .sph (Sphere) files and partly
+of .wav files with 24-bit samples. The development set is transcribed
+conversational audio that can be used during training to tune model
+parameters. The test data come in long unsegmented files. The reference
+transcripts for the test set are provided, so WER can be measured on the test
+set. The evaluation set is untranscribed audio that can be used for
+semi-supervised training of the acoustic model.
+Conversational speech data in the train and test sets are two-channel audio with
+the two channels temporally aligned. In the train set, each audio channel is
+provided and transcribed as a separate file, identified as the inLine or outLine
+channel. In the test sets, both audio channels are interleaved in a single file
+and there is a single interleaved transcript that reflects the temporal
+alignments. In addition to conversational speech, the test and evaluation sets
+also contain other genres of speech, namely news broadcast and topical
+broadcast, which are single-channel files.
+
+
+Running the recipe:
+
+In the s5 directory (<language_name> is one of swahili, tagalog, or somali):
+./run.sh --language <language_name>
+./local/chain/run_tdnn.sh
+./local/chain/decode_test.sh --language <language_name>
+./local/rnnlm/run_tdnn_lstm.sh
diff --git a/egs/material/s5/RESULTS b/egs/material/s5/RESULTS
new file mode 100644
index 00000000000..546f1630698
--- /dev/null
+++ b/egs/material/s5/RESULTS
@@ -0,0 +1,51 @@
+WER results for supervised and semi-supervised acoustic model training
+
+Baseline: GMM training to create alignments, followed by a lattice-free
+MMI-trained neural network with a factorized TDNN. The labeled audio of the
+BUILD package is used for supervised acoustic model training; the unlabeled
+EVAL audio is added for semi-supervised acoustic model training.
+
+Source-side bitext from the BUILD package and crawled monolingual data are used
+for building the n-gram LM, for RNNLM re-scoring, and for extending the baseline lexicon.
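The tables below report word error rates in percent. As a usage note, after a
finished run the same numbers can be re-collected from the scoring directories
with a small sketch like the one below; exp/chain/tdnn1b_sp is the default
model directory in local/chain/decode_test.sh, and the exact decode-directory
names depend on which test sets were decoded, so adjust the glob as needed:

  # Sketch: print the best WER line for each scored decode directory
  # (the same grep | best_wer.sh idiom used at the end of local/postprocess_test.sh).
  for d in exp/chain/tdnn1b_sp/decode_*; do
    grep -H Sum $d/score*/*.sys | utils/best_wer.sh
  done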
+
+
+Results for *supervised* acoustic model training:
+
+Swahili
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 36.8 36.7 38.9
+ANALYSIS1 42.5 41.3 41.4
+ANALYSIS2 38.1 36.8 36.9
+
+Tagalog
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 46.4 46.1 47.5
+ANALYSIS1 52.1 51.0 50.9
+ANALYSIS2 53.6 52.3 52.2
+
+Somali
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 57.4 56.5 57.8
+ANALYSIS1 61.6 57.8 57.7
+ANALYSIS2 59.3 55.5 55.3
+
+
+Results for *semi-supervised* acoustic model training:
+
+Swahili
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 35.3 35.1 36.7
+ANALYSIS1 35.2 34.5 34.7
+ANALYSIS2 30.8 30.0 30.1
+
+Tagalog
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 45.0 45.2 46.6
+ANALYSIS1 40.8 40.1 40.1
+ANALYSIS2 41.1 40.6 40.6
+
+Somali
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 56.8 56.3 57.7
+ANALYSIS1 50.6 48.8 48.6
+ANALYSIS2 49.8 48.2 48.2
diff --git a/egs/material/s5/cmd.sh b/egs/material/s5/cmd.sh
new file mode 100644
index 00000000000..2bb1c6d24f5
--- /dev/null
+++ b/egs/material/s5/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 2G"
+export decode_cmd="retry.pl --num-tries 3 queue.pl --mem 8G"
diff --git a/egs/material/s5/conf/decode.config b/egs/material/s5/conf/decode.config
new file mode 100644
index 00000000000..7ba966f2b83
--- /dev/null
+++ b/egs/material/s5/conf/decode.config
@@ -0,0 +1 @@
+# empty config, just use the defaults.
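As the comments in cmd.sh note, sites without a GridEngine queue can point the
two exports at run.pl or slurm.pl instead. A minimal sketch, keeping the memory
options from the defaults above (for slurm.pl, a site-specific config along the
lines of conf/queue.conf may also be needed, as described at
http://kaldi-asr.org/doc/queue.html):

  # cmd.sh variant for a single machine with no queueing system; as warned
  # above, watch memory usage and run the stages one at a time.
  export train_cmd="run.pl --mem 2G"
  export decode_cmd="run.pl --mem 8G"

  # or, on a SLURM cluster:
  # export train_cmd="slurm.pl --mem 2G"
  # export decode_cmd="slurm.pl --mem 8G"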
diff --git a/egs/material/s5/conf/lang/somali.conf b/egs/material/s5/conf/lang/somali.conf new file mode 100755 index 00000000000..999c4c0ef14 --- /dev/null +++ b/egs/material/s5/conf/lang/somali.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/BUILD/ +# test audio files to decode +audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1S-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/corpus/paracrawl-release3.2018-11-05.en-so.zipporah-20-dedup.lang-filtered.so +mono2=/home/pkoehn/statmt/data/data.statmt.org/lm/so.filtered.tok.gz +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping=/home/pkoehn/experiment/material-asr-so-en/scripts/somali_1_9999.txt +# Acoustic model parameters +numShorestUtts=40000 +numLeavesTri1=2000 +numGaussTri1=30000 +numLeavesTri2=3000 +numGaussTri2=60000 +numLeavesTri3=6000 +numGaussTri3=80000 + + diff --git a/egs/material/s5/conf/lang/swahili.conf b/egs/material/s5/conf/lang/swahili.conf new file mode 100755 index 00000000000..d90f4c2abd7 --- /dev/null +++ b/egs/material/s5/conf/lang/swahili.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/ +# test audio files to decode +audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1A-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/mono-corpus/mono.2018-04-24.sw +mono2= +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping=/home/pkoehn/experiment/material-asr-so-en/scripts/swahili_1_9999.txt +# Acoustic model parameters +numShorestUtts=40000 +numLeavesTri1=2000 +numGaussTri1=30000 +numLeavesTri2=3000 +numGaussTri2=60000 +numLeavesTri3=6000 +numGaussTri3=80000 + + diff --git a/egs/material/s5/conf/lang/tagalog.conf b/egs/material/s5/conf/lang/tagalog.conf new file mode 100644 index 00000000000..238979feb3f --- /dev/null +++ b/egs/material/s5/conf/lang/tagalog.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/home/pkoehn/experiment/material-asr-so-en/scripts/swahili_1_9999.txt +# test audio files to decode 
+audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1B-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/mono-corpus/mono.2018-04-24.tl +mono2= +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping= +# Acoustic model parameters +numShorestUtts=45000 +numLeavesTri1=4000 +numGaussTri1=60000 +numLeavesTri2=5000 +numGaussTri2=80000 +numLeavesTri3=7000 +numGaussTri3=100000 + + diff --git a/egs/material/s5/conf/mfcc.conf b/egs/material/s5/conf/mfcc.conf new file mode 100644 index 00000000000..e6defc10078 --- /dev/null +++ b/egs/material/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=8000 diff --git a/egs/material/s5/conf/mfcc_hires.conf b/egs/material/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..f218143e78a --- /dev/null +++ b/egs/material/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # most of the files are 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/material/s5/conf/online_cmvn.conf b/egs/material/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/material/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/material/s5/conf/plp.conf b/egs/material/s5/conf/plp.conf new file mode 100644 index 00000000000..926bcfca92a --- /dev/null +++ b/egs/material/s5/conf/plp.conf @@ -0,0 +1 @@ +--sample-frequency=8000 diff --git a/egs/material/s5/local/audio2wav_scp.pl b/egs/material/s5/local/audio2wav_scp.pl new file mode 100755 index 00000000000..f051c2714d2 --- /dev/null +++ b/egs/material/s5/local/audio2wav_scp.pl @@ -0,0 +1,55 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + + +my $sox = `which sox` or die "The sox binary does not exist"; +chomp $sox; +my $sph2pipe = `which sph2pipe` or die "The sph2pipe binary does not exist"; +chomp $sph2pipe; + +while() { + chomp; + my $full_path = $_; + (my $basename = $full_path) =~ s/.*\///g; + + die "The filename $basename does not match the expected naming pattern!" unless $basename =~ /.*\.(wav|sph)$/; + (my $ext = $basename) =~ s/.*\.(wav|sph)$/$1/g; + (my $name = $basename) =~ s/(.*)\.(wav|sph)$/$1/g; + + + # name looks like this: + # MATERIAL_BASE-1A-BUILD_10002_20131130_011225_inLine.sph + # Please note that the naming pattern must match + # the pattern in create_datafiles.pl + $name =~ s/inLine.*/0/g; + $name =~ s/outLine.*/1/g; + $name =~ s/_BASE//g; + $name =~ s/-BUILD//g; + + if ($ext eq "wav") { + print "$name $sox $full_path -r 8000 -c 1 -b 16 -t wav - downsample|\n"; + } else { + print "$name $sph2pipe -f wav -p -c 1 $full_path|\n"; + } +} + + diff --git a/egs/material/s5/local/chain/decode_test.sh b/egs/material/s5/local/chain/decode_test.sh new file mode 100755 index 00000000000..40115a04cf6 --- /dev/null +++ b/egs/material/s5/local/chain/decode_test.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Mahsa Yarmohammadi +# 2018 Yiming Wang + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +language=swahili +stage=0 +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" +dir=exp/chain/tdnn1b_sp +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp +cmd=queue.pl +graph_affix=_combined + +# training options +chunk_width=140,100,160 +chunk_left_context=0 +chunk_right_context=0 + +# ivector options +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=600 +filter_ctm=true +weights_file= +silence_weight=0.00001 +nj=30 + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. ./lang.conf + +if ! cuda-compiled; then + cat </dev/null || true + +if [ $stage -le 3 ]; then + # do the 1st pass decoding + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l ${dir}/decode_${data}_segmented/ali.JOB.gz" || exit 1; + + cp $lang/phones.txt ${dir}/decode_${data}_segmented || exit 1; + + steps/resegment_data.sh --segmentation-opts "$segmentation_opts" ${datadir}_segmented_hires $lang \ + ${dir}/decode_${data}_segmented ${datadir}_segmented_reseg_hires_tmp exp/resegment_${data}_segmented + + utils/data/subsegment_data_dir.sh ${datadir}_segmented_hires ${datadir}_segmented_reseg_hires_tmp/segments \ + ${datadir}_segmented_reseg_hires + + rm -rf ${datadir}_segmented_reseg_hires_tmp 2>/dev/null || true + + echo "Extracting i-vectors, stage 2" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. 
+ steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${datadir}_segmented_reseg_hires $lang exp/nnet3/extractor \ + exp/nnet3/ivectors_${data}_segmented_reseg_hires; + done +fi + +if [ $stage -le 5 ]; then + # 2nd-pass decoding on the resegmented data + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l 1792 xent:train/valid[65,98,final]=(-1.93,-1.66,-1.68/-2.05,-1.84,-1.83) logprob:train/valid[65,98,final]=(-0.199,-0.166,-0.167/-0.225,-0.208,-0.206) +# [for tagalog] +# exp/chain/tdnn1a_sp: num-iters=96 nj=2..12 num-params=12.3M dim=40+100->1952 combine=-0.165->-0.165 (over 2) xent:train/valid[63,95,final]=(-1.89,-1.66,-1.65/-2.06,-1.89,-1.89) logprob:train/valid[63,95,final]=(-0.186,-0.158,-0.157/-0.231,-0.219,-0.218) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +tree_affix= +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=7 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.01 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.005" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=768 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=768 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=768 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=768 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=768 input=Append(0, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=768 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=768 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=768 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1024 + output-layer name=output include-log-softmax=false dim=$num_targets bottleneck-dim=256 $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1024 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor bottleneck-dim=256 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1816 +# combine=-0.127->-0.127 (over 2) xent:train/valid[65,98,final]=(-1.74,-1.44,-1.43/-1.80,-1.62,-1.61) +# logprob:train/valid[65,98,final]=(-0.175,-0.136,-0.135/-0.194,-0.182,-0.180) + +# [for tagalog] +# exp/chain/tdnn1b_sp/: num-iters=96 nj=2..12 num-params=17.2M dim=40+100->1928 combine=-0.124->-0.123 +# (over 2) xent:train/valid[63,95,final]=(-1.69,-1.43,-1.42/-1.75,-1.62,-1.60) +# logprob:train/valid[63,95,final]=(-0.168,-0.128,-0.127/-0.193,-0.187,-0.187) + +# [for somali] +# exp/chain/tdnn1b_sp/: num-iters=84 nj=2..12 num-params=17.9M dim=40+100->3240 combine=-0.162->-0.160 +# (over 2) xent:train/valid[55,83,final]=(-2.31,-2.02,-2.00/-2.27,-2.13,-2.10) +# logprob:train/valid[55,83,final]=(-0.218,-0.157,-0.154/-0.268,-0.263,-0.263) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. 
+tree_affix= +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=7 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 
$linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1792 combine=-0.176->-0.174 (over 6) xent:train/valid[45,69,final]=(-1.71,-1.52,-1.50/-1.81,-1.69,-1.67) logprob:train/valid[45,69,final]=(-0.185,-0.160,-0.159/-0.213,-0.208,-0.205) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train +test_sets=dev +gmm=tri3 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +tlstm_affix=1a # affix for the TDNN-LSTM directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.02" + lstm_opts="l2-regularize=0.005" + output_opts="l2-regularize=0.004" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=512 + relu-batchnorm-layer name=tdnn2 $tdnn_opts input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn3 $tdnn_opts input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn5 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn6 $tdnn_opts input=Append(-3,0,3) dim=512 + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn7 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn8 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn9 $tdnn_opts input=Append(-3,0,3) dim=512 + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin=8 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +# replacement of the smart-match operator (apparently not supported anymore) +sub is_elem { + my $word = shift; + my $array = shift; + foreach my $other_word (@{$array}) { + return 1 if $word eq $other_word; + } + return 0; +} + +my $unk = ""; +my $noise = ""; +my $spnoise = ""; +my $sil = ""; + +my @ignore_events = ("", ""); +#as per the BABEL docs, ~ means truncation of the word/utterance +my @ignore_utt_events = ("", "", "", "~"); +my @sil_events = (""); +my @noise_events = ("", "", "" ); +my @spnoise_events = ("", "", "", "", "", ""); + + + +UTT: while(<>) { + chomp; + my @line = split " ", $_; + my $file = shift @line; + my $begin = shift @line; + my $end = shift @line; + + next if (@line == 1) and ($line[0] eq ""); + next if (@line == 1) and ($line[0] =~ "<.*>"); #skip the utterance if all + #it contains is a non-speech event + + my @out_line; + foreach my $word (@line) { + if ($word =~ /.*-$/) { + push @out_line, $unk; + } elsif ($word =~ /^-.*/) { + push @out_line, $unk; + } elsif ($word =~ /^\*.*\*$/) { + push @out_line, $unk; + } elsif ($word eq "(())") { + push @out_line, $unk; + } elsif (is_elem $word, \@ignore_events) { + next; + } elsif (is_elem $word, \@ignore_utt_events) { + next UTT; + } elsif (is_elem $word, \@sil_events) { + push @out_line, $sil; + } elsif (is_elem $word, \@noise_events) { + push @out_line, $noise; + } elsif (is_elem $word, \@spnoise_events) { + push @out_line, $spnoise; + } else { + push @out_line, $word; + } + } + print "$file\t$begin\t$end\t" . join(" ", @out_line) . "\n" if @out_line; + +} + + diff --git a/egs/material/s5/local/convert_lexicon.pl b/egs/material/s5/local/convert_lexicon.pl new file mode 100755 index 00000000000..1fe7e90ac1f --- /dev/null +++ b/egs/material/s5/local/convert_lexicon.pl @@ -0,0 +1,76 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +my $lexicon_name = $ARGV[0]; +open(my $lexicon_file, "<:encoding(UTF-8)", $lexicon_name) or + die "Cannot open $lexicon_name: $!\n"; + +my $wordlist_name = $ARGV[1]; +open(my $wordlist_file, "<:encoding(UTF-8)", $wordlist_name) or + die "Cannot open $wordlist_name: $!\n"; + + +my %lexicon; +while (<$lexicon_file>) { + chomp; + (my $word, my $prons) = split " ", $_, 2; + $lexicon{uc $word} = $prons; +} + +while (<$wordlist_file>) { + chomp; + my $word = $_; + print STDERR "Cannot find word $word in lexicon\n" unless defined($lexicon{uc $word}); + + #print "$word $lexicon{$word}\n"; + + my @prons = split "\t", $lexicon{uc $word}; + foreach my $pron (@prons) { + my @phones = split " ", $pron; + my $stress_mark = 0; + my @out_phones = (); + foreach my $phone (@phones) { + if ($phone eq "\"") { + $stress_mark = 1 + } elsif ( $phone eq "." ) { + $stress_mark = 0; + push @out_phones, '.'; + } elsif ( $phone eq "#" ) { + $stress_mark = 0; + push @out_phones, '.'; + } else { + $phone =~ s/_/+/g; + #let's just ignore stress for now + #$phone = "${phone}_\"" if $stress_mark; + push @out_phones, $phone; + } + } + my $out_pron = join(" ", @out_phones); + $out_pron =~ s/ *\. */\t/g; + print "$word\t$out_pron\n"; + } +} + diff --git a/egs/material/s5/local/count_oovs.pl b/egs/material/s5/local/count_oovs.pl new file mode 100755 index 00000000000..228399f99e3 --- /dev/null +++ b/egs/material/s5/local/count_oovs.pl @@ -0,0 +1,81 @@ +#!/usr/bin/perl -W + +# (c) 2014 Korbinian Riedhammer + +# Count the number of OOV per turn (or speaker, if utt2spk is provided). Use +# the --split-words option to split non-ascii words into characters (syllable +# based languages). + + +use strict; +use warnings; +use Getopt::Long; +use open qw(:std :utf8); + + +my $utt2spkf = ""; +my $split_words = 0; + +GetOptions( + 'utt2spk=s' => \$utt2spkf, + 'split-words' => \$split_words +); + +if (scalar @ARGV lt 1) { + print STDERR "usage: $0 [--utt2spk=utt2spk] words.txt [input]\n"; + exit 1; +} + +my $lexf = shift @ARGV; + +my %lex = map { my ($a, $b) = split /\s+/; $a => $b; } `cat $lexf`; + +my %utt2spk = (); +if (length $utt2spkf gt 0) { + %utt2spk = map { my ($a, $b) = split /\s+/; $a => $b; } `cat $utt2spkf`; #read_file($utt2spkf, binmode => ':utf8'); +} + +my %num_words = (); +my %num_oovs = (); +my %oov_string = (); + +while (<>) { + my ($id, @trl) = split /\s+/; + + if (length $utt2spkf gt 0) { + if (defined $utt2spk{$id}) { + $id = $utt2spk{$id}; + } else { + printf STDERR "Warning: $id not specified in $utt2spkf\n"; + } + } + + $num_words{$id} = 0 unless defined $num_words{$id}; + $num_oovs{$id} = 0 unless defined $num_oovs{$id}; + $oov_string{$id} = "" unless defined $oov_string{$id}; + + + if ($split_words) { + for (my $i = 0; $i < scalar @trl; $i++) { + my $w = $trl[$i]; + unless ($w =~ m/[a-zA-Z_\-]/) { + my @sw = split //, $w; + splice @trl, $i, 1, @sw; + $i += (scalar @sw) - 1; + } + } + } + + $num_words{$id} += scalar @trl; + for my $w (@trl) { + $num_oovs{$id} += 1 unless defined $lex{$w}; + $oov_string{$id} .= "$w " unless defined $lex{$w}; + } + +} + +for my $i (sort keys %num_words) { + printf "%s %d %d %s\n", $i, $num_words{$i}, $num_oovs{$i}, + ( defined $oov_string{$i} ? 
$oov_string{$i} : ""); +} + diff --git a/egs/material/s5/local/create_datafiles.pl b/egs/material/s5/local/create_datafiles.pl new file mode 100755 index 00000000000..d8e692524a1 --- /dev/null +++ b/egs/material/s5/local/create_datafiles.pl @@ -0,0 +1,69 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +my $output = $ARGV[0]; +open(my $utt2spk, ">:utf8", "$output/utt2spk") or + die "Cannot open $output/utt2spk: $!\n"; +open(my $text, ">:utf8", "$output/text") or + die "Cannot open $output/text: $!\n"; +open(my $segments, ">:utf8", "$output/segments") or + die "Cannot open $output/segments: $!\n"; +open(my $wav, ">:utf8", "$output/wav2file") or + die "Cannot open $output/wav2file: $!\n"; + +my %text2id; +while() { + chomp; + my @line = split (" ", $_, 4); + my $name = shift @line; + my $begin = shift @line; + my $end = shift @line; + my $words = shift @line; + my $name_raw = $name; + + my $begin_text = sprintf("%07d", $begin * 1000); + my $end_text = sprintf("%07d", $end * 1000); + + # name looks like this: + # MATERIAL_BASE-1A-BUILD_10002_20131130_011225_inLine.txt + # Please note that the naming pattern must match + # the pattern in audio2wav_scp.pl + $name =~ s/inLine.*/0/g; + $name =~ s/outLine.*/1/g; + $name =~ s/_BASE//g; + $name =~ s/-BUILD//g; + + my $utt_name = join("_", $name, $begin_text, $end_text); + print $segments "$utt_name $name $begin $end\n"; + print $utt2spk "$utt_name $name\n"; + print $text "$utt_name $words\n"; + if (defined $text2id{$name}) { + die "" if $text2id{$name} ne $name_raw; + } else { + print $wav "$name $name_raw\n"; + $text2id{$name} = $name_raw; + } +} diff --git a/egs/material/s5/local/ctm_filter b/egs/material/s5/local/ctm_filter new file mode 100755 index 00000000000..fa0f749c92a --- /dev/null +++ b/egs/material/s5/local/ctm_filter @@ -0,0 +1,7 @@ +#!/usr/bin/perl + +while (<>) { + if ($_ !~ m/<(noise|unk|spnoise|sil)>/i) { + print $_; + } +} diff --git a/egs/material/s5/local/g2p/apply_g2p.sh b/egs/material/s5/local/g2p/apply_g2p.sh new file mode 100755 index 00000000000..704a1a906bb --- /dev/null +++ b/egs/material/s5/local/g2p/apply_g2p.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Copyright 2016 Allen Guo +# 2017 Xiaohui Zhang +# Apache License 2.0 + +# This script applies a trained Phonetisarus G2P model to +# synthesize pronunciations for missing words (i.e., words in +# transcripts but not the lexicon), and output the expanded lexicon. + +var_counts=1 + +. ./path.sh || exit 1 +. 
parse_options.sh || exit 1; + +if [ $# -ne "4" ]; then + echo "Usage: $0 " + exit 1 +fi + +model=$1 +workdir=$2 +lexicon=$3 +outlexicon=$4 + +mkdir -p $workdir + +echo 'Synthesizing pronunciations for missing words...' +phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt + +echo "Adding new pronunciations to $lexicon" +cat "$lexicon" $workdir/missing_g2p_${var_counts}.txt | sort | uniq > $outlexicon diff --git a/egs/material/s5/local/g2p/train_g2p.sh b/egs/material/s5/local/g2p/train_g2p.sh new file mode 100755 index 00000000000..43e75f6608d --- /dev/null +++ b/egs/material/s5/local/g2p/train_g2p.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Copyright 2017 Intellisist, Inc. (Author: Navneeth K) +# 2017 Xiaohui Zhang +# Apache License 2.0 + +# This script trains a g2p model using Phonetisaurus and SRILM. + +stage=0 +silence_phones= + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1; +fi + +lexicondir=$1 +outdir=$2 + +[ ! -f $lexicondir/lexicon.txt ] && echo "Cannot find $lexicondir/lexicon.txt" && exit + +isuconv=`which uconv` +if [ -z $isuconv ]; then + echo "uconv was not found. You must install the icu4c package." + exit 1; +fi + +mkdir -p $outdir + + +# For input lexicon, remove pronunciations containing non-utf-8-encodable characters, +# and optionally remove words that are mapped to a single silence phone from the lexicon. +if [ $stage -le 0 ]; then + lexicon=$lexicondir/lexicon.txt + if [ ! -z "$silence_phones" ]; then + awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i; if(!(s in a)) print $1" "s}' \ + $silence_phones $lexicon | \ + awk '{printf("%s\t",$1); for (i=2;i 0'> $outdir/lexicon_tab_separated.txt + else + awk '{printf("%s\t",$1); for (i=2;i 0'> $outdir/lexicon_tab_separated.txt + fi +fi + +if [ $stage -le 1 ]; then + # Align lexicon stage. Lexicon is assumed to have first column tab separated + phonetisaurus-align --input=$outdir/lexicon_tab_separated.txt --ofile=${outdir}/aligned_lexicon.corpus || exit 1; +fi + +if [ $stage -le 2 ]; then + # Convert aligned lexicon to arpa using srilm. + ngram-count -order 7 -kn-modify-counts-at-end -gt1min 0 -gt2min 0 \ + -gt3min 0 -gt4min 0 -gt5min 0 -gt6min 0 -gt7min 0 -ukndiscount \ + -text ${outdir}/aligned_lexicon.corpus -lm ${outdir}/aligned_lexicon.arpa +fi + +if [ $stage -le 3 ]; then + # Convert the arpa file to FST. + phonetisaurus-arpa2wfst --lm=${outdir}/aligned_lexicon.arpa --ofile=${outdir}/model.fst +fi diff --git a/egs/material/s5/local/nnet3/run_ivector_common.sh b/egs/material/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..a56b3bf67d8 --- /dev/null +++ b/egs/material/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="dev" +nj=30 +gmm=tri3 + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_sp data/lang_test $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. 
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l ", "--", ".", "?", "~"]: + parts[i] = "" + elif w == "%incomplete": + parts[i] = "" + elif w in ["", "", "", ""]: + parts[i] = "" + elif w in ["", ""]: + parts[i] = "" + elif w in ["", "(())", "", "", ""]: + parts[i] = "" + + # change *word* into word + parts[i] = re.sub(r"^[*](\S+)[*]$", r"\1", parts[i]) + + return re.sub(r"\s+", " ", " ".join(parts)) + + +def write_segment(start_time, end_time, text, reco_id, + segments_fh, utt2spk_fh, text_fh): + assert end_time > start_time + + text = normalize_text(text) + + utt_id = "{reco_id}-{st:06d}-{end:06d}".format( + reco_id=reco_id, + st=int(start_time * 100), end=int(end_time * 100)) + + print ("{utt_id} {reco_id} {st} {end}" + "".format(utt_id=utt_id, reco_id=reco_id, + st=start_time, end=end_time), + file=segments_fh) + print ("{utt_id} {reco_id}" + "".format(utt_id=utt_id, reco_id=reco_id), + file=utt2spk_fh) + print ("{utt_id} {text}" + "".format(utt_id=utt_id, text=text), + file=text_fh) + + +def parse_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh): + base_name = os.path.basename(transcript_file) + file_id = re.sub(".transcription.txt", "", base_name) + + inline_start_time = -1 + outline_start_time = -1 + + i = 0 + + for line in open(transcript_file): + parts = line.strip().split() + + if i == 0 and not parts[0].startswith('0'): + raise Exception("Transcript file {0} does not start with 0.000" + "".format(transcript_file)) + i += 1 + + start_time = float(parts[0]) + if len(parts) == 1: + # Last line in the file + write_segment(inline_start_time, start_time, inline_text, file_id + "_inLine", + segments_fh, utt2spk_fh, text_fh) + write_segment(outline_start_time, start_time, outline_text, file_id + "_outLine", + segments_fh, utt2spk_fh, text_fh) + break + + assert parts[1] in ["inLine", "outLine"] + + if parts[1] == "inLine": + reco_id = file_id + "_inLine" + if inline_start_time >= 0: + write_segment(inline_start_time, start_time, inline_text, reco_id, + segments_fh, utt2spk_fh, text_fh) + inline_text = " ".join(parts[2:]) + inline_start_time = start_time + else: + reco_id = file_id + "_outLine" + if outline_start_time >= 0: + write_segment(outline_start_time, start_time, outline_text, reco_id, + segments_fh, utt2spk_fh, text_fh) + outline_text = " ".join(parts[2:]) + outline_start_time = start_time + + +def parse_non_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh): + base_name = os.path.basename(transcript_file) + file_id = re.sub(".transcription.txt", "", base_name) + + start_time = -1 + i = 0 + + with open(transcript_file) as fh: + line = fh.readline().strip() + if not line.startswith('['): + raise Exception("Transcript file {0} does not start with [0.000" + "".format(transcript_file)) + try: + start_time = float(re.sub(r"\[([^\]]+)\]", r"\1", line)) + except Exception: + print("Could not parse line {0}".format(line), file=sys.stderr) + raise + + text = fh.readline() + while text != '': + text = text.strip() + line = fh.readline().strip() + if not line.startswith('['): + raise Exception("Time-stamp in transcript file {0} does not start with [; error parsing line {1} after text {2}" + "".format(transcript_file, line, text)) + try: + end_time = float(re.sub(r"\[([^\]]+)\]", r"\1", line)) + except Exception: + print("Could not parse line {0}".format(line), file=sys.stderr) + raise + + write_segment(start_time, end_time, text, file_id, + segments_fh, utt2spk_fh, 
text_fh) + start_time = end_time + text = fh.readline() + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print ("Usage: {0} ", + file=sys.stderr) + raise SystemExit(1) + + root_path = sys.argv[1] + calls_list = open(sys.argv[2]).readlines() + non_calls_list = open(sys.argv[3]).readlines() + data_dir = sys.argv[4] + + wav_scp_fh = open("{0}/wav.scp".format(data_dir), 'w') + utt2spk_fh = open("{0}/utt2spk".format(data_dir), 'w') + reco2file_and_channel_fh = open( + "{0}/reco2file_and_channel".format(data_dir), 'w') + text_fh = open("{0}/text".format(data_dir), 'w') + segments_fh = open("{0}/segments".format(data_dir), 'w') + + for line in calls_list: + file_id = line.strip() + transcript_file = ( + "{root_path}/transcription/{file_id}.transcription.txt" + "".format(root_path=root_path, file_id=file_id)) + wav_file = "{root_path}/src/{file_id}.wav".format( + root_path=root_path, file_id=file_id) + + for channel in [1, 2]: + reco_id = file_id + ("_inLine" if channel == 1 else "_outLine") + print ("{reco_id} {file_id} {channel}" + "".format(reco_id=reco_id, file_id=file_id, + channel="A" if channel == 1 else "B"), + file=reco2file_and_channel_fh) + print ("{reco_id} sox {wav_file} -r 8000 -b 16 -c 1 -t wav - remix {channel} |" + "".format(reco_id=reco_id, wav_file=wav_file, channel=channel), + file=wav_scp_fh) + + parse_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh) + + for line in non_calls_list: + file_id = line.strip() + transcript_file = ( + "{root_path}/transcription/{file_id}.transcription.txt" + "".format(root_path=root_path, file_id=file_id)) + wav_file = "{root_path}/src/{file_id}.wav".format( + root_path=root_path, file_id=file_id) + + print ("{file_id} {file_id} 1" + "".format(file_id=file_id), + file=reco2file_and_channel_fh) + print ("{reco_id} sox {wav_file} -r 8000 -b 16 -c 1 -t wav - |" + "".format(reco_id=file_id, wav_file=wav_file), + file=wav_scp_fh) + + parse_non_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh) + + wav_scp_fh.close() + utt2spk_fh.close() + reco2file_and_channel_fh.close() + text_fh.close() + segments_fh.close() diff --git a/egs/material/s5/local/parse_transcripts.pl b/egs/material/s5/local/parse_transcripts.pl new file mode 100755 index 00000000000..06c18a30c6c --- /dev/null +++ b/egs/material/s5/local/parse_transcripts.pl @@ -0,0 +1,53 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +my $file = $ARGV[0]; + +open(my $transcript, "<:utf8", $file) or + die "Cannot open file $file: $!\n"; + +(my $basename = $file) =~ s/(.*\/)?([^\/]+)/$2/g; + +my $sentence = undef; +my $begin_time = undef; +my $end_time = undef; +while(<$transcript>) { + chomp; + if (/^\[([0-9.]+)\]$/) { + $begin_time = $end_time; + $end_time = $1; + if ($sentence) { + print "$basename\t$begin_time\t$end_time\t$sentence\n"; + $sentence = undef; + } + } else { + die "Invalid format of the transcription in $basename\n" if defined($sentence); + $sentence = $_; + } +} + +die "Invalid format of the transcription in $basename\n" if defined($sentence); + diff --git a/egs/material/s5/local/postprocess_test.sh b/egs/material/s5/local/postprocess_test.sh new file mode 100755 index 00000000000..950c1191d4d --- /dev/null +++ b/egs/material/s5/local/postprocess_test.sh @@ -0,0 +1,56 @@ +#!/bin/sh +set -euo pipefail +echo "$0 $@" + +stage=0 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 analysis1 exp/chain/tdnn/graph exp/chain/tdnn/decode_analysis1_segmented" + exit 1 +fi + +data=$1 +graph_dir=$2 +decode_dir=$3 + +# get recording-level CTMs from the lattice by resolving the overlapping +# regions + +if [ $stage -le 0 ]; then + steps/get_ctm_fast.sh --cmd "$decode_cmd" --frame-shift 0.03 \ + data/${data}_hires/ ${graph_dir} \ + ${decode_dir} ${decode_dir}/score_10_0.0 +fi + +if [ $stage -le 1 ]; then + utils/ctm/resolve_ctm_overlaps.py data/${data}_hires/segments \ + ${decode_dir}/score_10_0.0/ctm \ + - | utils/convert_ctm.pl data/${data}_hires/segments data/${data}_hires/reco2file_and_channel > \ + ${decode_dir}/score_10_0.0/${data}_hires.ctm +fi + +if [ $stage -le 2 ]; then + # extract n-best lists from archive.* files + if [[ ${decode_dir} == *_rescore_nbest ]]; then + hyp_filtering_cmd="cat" + [ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" + [ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + mkdir -p ${decode_dir}/output_nbest + for f in ${decode_dir}/archives.*; do + docid=$(head -1 $f/words_text | awk '{print $1}' | cut -f1,2 -d'-') + $hyp_filtering_cmd $f/words_text > \ + ${decode_dir}/output_nbest/$docid".n.txt" || exit 1; + done + fi + + # compute WER + local/score_stm.sh --min-lmwt 10 --max-lmwt 10 --word-ins-penalty 0.0 \ + --cmd "$decode_cmd" data/${data}_hires $graph_dir ${decode_dir} + + grep -H Sum ${decode_dir}/score*/*.sys | utils/best_wer.sh +fi diff --git a/egs/material/s5/local/prepare_audio_data.sh b/egs/material/s5/local/prepare_audio_data.sh new file mode 100755 index 00000000000..2bf9283f435 --- /dev/null +++ b/egs/material/s5/local/prepare_audio_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 " + echo "e.g." + echo " $0 /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1 + +conversational_train=$data/conversational/training/ +audio=$conversational_train/audio/ +[ ! -d $audio ] && \ + echo "The directory $audio does not exist!" 
&& exit 1 + +find $audio -type f \( -name "*.wav" -o -name "*.sph" \) | \ + local/audio2wav_scp.pl > data/train/wav.scp + + +conversational_dev=$data/conversational/dev +audio=$conversational_dev/audio/ +[ ! -d $audio ] && \ + echo "The directory $audio does not exist!" && exit 1 + +find $audio -type f \( -name "*.wav" -o -name "*.sph" \) | \ + local/audio2wav_scp.pl > data/dev/wav.scp + diff --git a/egs/material/s5/local/prepare_dict.sh b/egs/material/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..710f1a66e2e --- /dev/null +++ b/egs/material/s5/local/prepare_dict.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +language=swahili + +. ./utils/parse_options.sh + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 [options] " + echo "e.g." + echo " $0 --language swahili /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1 + +lexicon=$data/conversational/reference_materials/lexicon.txt + +mkdir -p data/local +cat $lexicon | awk '{print $1}' > data/local/lexicon_words +cat $lexicon | cut -f2- > data/local/lexicon_phns + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses +SOURCE_TC_MODEL=/home/pkoehn/experiment/material-asr-${language_affix}-en/truecaser/truecase-model.1.${language_affix} + $MOSES/scripts/recaser/truecase.perl -model $SOURCE_TC_MODEL \ + < data/local/lexicon_words > data/local/lexicon_words_tc + +paste data/local/lexicon_words_tc data/local/lexicon_phns | sort > data/local/lexicon_tc + +lexicon=data/local/lexicon_tc + +[ ! -f $lexicon ] && echo "Lexicon $lexicon does not exist!" && exit 1; +echo $0: using lexicon $lexicon +mkdir -p data/local/dict_nosp/ +cat data/train/text | cut -f 2- -d ' ' | \ + sed 's/ /\n/g' | grep . | sort -u > data/local/dict_nosp/wordlist + +local/convert_lexicon.pl <(echo -e "\t\n\t\n\t\n\t" | cat - $lexicon ) data/local/dict_nosp/wordlist | sort -u > data/local/dict_nosp/lexicon.txt +[ -f data/local/dict_nosp/lexiconp.txt ] && rm data/local/dict_nosp/lexiconp.txt + +cat data/local/dict_nosp/lexicon.txt | sed 's/\t/ /g' | \ + cut -f 2- -d ' ' | sed 's/ /\n/g' | grep . | sort -u > data/local/dict_nosp/phones.txt + + +grep "^<.*>$" data/local/dict_nosp/phones.txt > data/local/dict_nosp/silence_phones.txt +grep -v "^<.*>$" data/local/dict_nosp/phones.txt > data/local/dict_nosp/nonsilence_phones.txt +echo "" > data/local/dict_nosp/optional_silence.txt +echo "" > data/local/dict_nosp/oov.txt + + + +utils/validate_dict_dir.pl data/local/dict_nosp/ + diff --git a/egs/material/s5/local/prepare_text_data.sh b/egs/material/s5/local/prepare_text_data.sh new file mode 100755 index 00000000000..4200a55ed9d --- /dev/null +++ b/egs/material/s5/local/prepare_text_data.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +language=swahili + +. ./utils/parse_options.sh + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 [options] " + echo "e.g." 
+ echo " $0 --language swahili /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1; +conversational_train=$data/conversational/training/ +mkdir -p data/train/ +for file in $conversational_train/transcription/*txt ; do + ./local/parse_transcripts.pl $file +done > data/train/transcripts.txt + + +conversational_dev=$data/conversational/dev/ +mkdir -p data/dev +for file in $conversational_dev/transcription/*txt ; do + ./local/parse_transcripts.pl $file +done > data/dev/transcripts.txt + + +cat data/train/transcripts.txt | \ + local/cleanup_transcripts.pl | \ + local/create_datafiles.pl data/train/ + +cat data/dev/transcripts.txt | \ + local/cleanup_transcripts.pl | \ + local/create_datafiles.pl data/dev/ + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses +SOURCE_TC_MODEL=/home/pkoehn/experiment/material-asr-${language_affix}-en/truecaser/truecase-model.1.${language_affix} + +for i in train dev; do + cat data/$i/text | cut -d " " -f2- > data/$i/text.notruecase + cat data/$i/text | cut -d " " -f1 > data/$i/uttids + # Truecase + $MOSES/scripts/recaser/truecase.perl -model $SOURCE_TC_MODEL \ + < data/$i/text.notruecase | sed "s=<= <=g" > data/$i/text.truecase +# cat data/$i/text.truecase | sed 's/' //g' | sed 's/&apos//g' | sed 's/[//g' | sed 's/]//g' | sed 's/" //g' | sed 's/" //g' | sed 's/& //g' | sed 's/@-@ //g' | sed 's/://g' | sed 's/\///g' | sed 's/%//g' | sed 's/+//g' | sed 's/( //g' | sed 's/) //g' | sed 's/\, //g' | sed 's/ \.//g' | sed 's/\?//g' | sed 's/\!//g' | sed 's/\;//g' > data/$i/text.nopunc + cat data/$i/text.truecase | tr 'A-Z' 'a-z' > data/$i/text.nopunc + paste -d " " data/$i/uttids data/$i/text.nopunc > data/$i/text +done + + diff --git a/egs/material/s5/local/preprocess_external_text.sh b/egs/material/s5/local/preprocess_external_text.sh new file mode 100755 index 00000000000..4cbc457310e --- /dev/null +++ b/egs/material/s5/local/preprocess_external_text.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -euo pipefail +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 $@" + +language=swahili +srctext_bitext=data/bitext/text + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +output=$1 + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses + +# Normalize punctuation and tokenize input +$MOSES/scripts/tokenizer/normalize-punctuation.perl ${language_affix} < ${srctext_bitext} \ + | $MOSES/scripts/tokenizer/tokenizer.perl -a -l ${language_affix} > ${srctext_bitext}.tok + +# convert to lower cases +cat ${srctext_bitext}.tok | tr 'A-Z' 'a-z' > ${srctext_bitext}.tc + +# Remove punctuation +cat ${srctext_bitext}.tc | sed 's/' //g' | sed 's/&apos//g' | sed 's/[//g' | sed 's/]//g' | sed 's/" //g' | sed 's/" //g' | sed 's/& //g' | sed 's/@-@ //g' | sed 's/-//g' | sed 's/://g' | sed 's/\///g' | sed 's/%//g' | sed 's/+//g' | sed 's/( //g' | sed 's/) //g' | sed 's/\, //g' | sed 's/ \.//g' | sed 's/\?//g' | sed 's/\!//g' | sed 's/\;//g' > $output + diff --git a/egs/material/s5/local/preprocess_test.sh b/egs/material/s5/local/preprocess_test.sh new file mode 100755 index 00000000000..fbc868d3f7c --- /dev/null +++ b/egs/material/s5/local/preprocess_test.sh @@ -0,0 +1,135 @@ +#!/bin/sh +set -euo pipefail +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 $@" + +stage=0 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh +. ./lang.conf + +datadev=$1 + +mkdir -p $datadev + +# 1. create the reference transcript $datadev/reftext + +dataset=$(basename $datadev) + +audio_path= +if [ $dataset == "analysis1" ]; then + audio_path=${audio_path_analysis1} +elif [ $dataset == "analysis2" ]; then + audio_path=${audio_path_analysis2} +elif [ $(basename $datadev) == 'test_dev' ]; then + audio_path=${audio_path_dev} +elif [ $(basename $datadev) == 'eval1' ]; then + audio_path=${audio_path_eval1} +elif [ $(basename $datadev) == 'eval2' ]; then + audio_path=${audio_path_eval2} +elif [ $(basename $datadev) == 'eval3' ]; then + audio_path=${audio_path_eval3} +fi + +[ -z ${audio_path} ] && echo "$0: test data should be either analysis1, analysis2, test_dev, eval1 or eval2." 
&& exit 1 + +metadata_file=${audio_path}/metadata/metadata.tsv + +if [ $stage -le 0 ]; then + mkdir -p data/local/$dataset + + tail -n +2 $metadata_file | \ + perl -ane '$F[0] =~ s/.wav//; print "$F[0] $F[1]\n";' > \ + data/local/$dataset/all_list + + awk '{if ($2 == "CS") { print $1 } }' data/local/$dataset/all_list > data/local/$dataset/call_list + awk '{if ($2 != "CS") { print $1 } }' data/local/$dataset/all_list > data/local/$dataset/non_call_list +fi + +if [ $stage -le 2 ]; then + rm data/local/$dataset/{wav.scp,reco2file_and_channel} 2>/dev/null || true + + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + local/parse_dev_transcripts.py $audio_path \ + data/local/$dataset/call_list \ + data/local/$dataset/non_call_list \ + data/local/$dataset + else + for f in $(cat data/local/$dataset/call_list); do + wav_file="$audio_path/src/$f.wav" + + echo "${f}_inLine sox $wav_file -r 8000 -b 16 -c 1 -t wav - remix 1 |" >> data/local/$dataset/wav.scp + echo "${f}_outLine sox $wav_file -r 8000 -b 16 -c 1 -t wav - remix 2 |" >> data/local/$dataset/wav.scp + echo "${f}_inLine ${f} A" >> data/local/$dataset/reco2file_and_channel + echo "${f}_outLine ${f} B" >> data/local/$dataset/reco2file_and_channel + done + + for f in $(cat data/local/$dataset/non_call_list); do + wav_file="$audio_path/src/$f.wav" + + echo "${f} sox $wav_file -r 8000 -b 16 -c 1 -t wav - |" >> data/local/$dataset/wav.scp + echo "${f} ${f} 1" >> data/local/$dataset/reco2file_and_channel + done + + awk '{print $1" "$1}' data/local/$dataset/wav.scp > data/local/$dataset/utt2spk + fi + utils/utt2spk_to_spk2utt.pl data/local/$dataset/utt2spk > data/local/$dataset/spk2utt + utils/fix_data_dir.sh data/local/$dataset + + utils/copy_data_dir.sh data/local/$dataset $datadev +fi + +if [ $stage -le 3 ]; then + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + cat data/local/$dataset/all_list | awk '{print $1" <"$2",O>"}' > \ + data/local/$dataset/all_list_labels + + awk '{print $2" "$1" "$3" "$4" "$1}' $datadev/segments | \ + utils/apply_map.pl -f 1 $datadev/reco2file_and_channel | \ + utils/apply_map.pl -f 3 $datadev/utt2spk | \ + awk '{print $1" "$2" "$3" "$4" "$5" "$1" "$6}' | \ + utils/apply_map.pl -f 7 $datadev/text | \ + utils/apply_map.pl -f 6 data/local/$dataset/all_list_labels | \ + sort +0 -1 +1 -2 +3nb -4 > \ + $datadev/stm + + touch $datadev/glm + fi +fi + +# 3. segment .wav files + +# 3.1. create a trivial segments file: + +if [ $stage -le 4 ]; then + utils/data/get_utt2dur.sh --nj 4 --cmd "$train_cmd" ${datadev} + + if [ ! -f $datadev/segments ]; then + utils/data/get_segments_for_data.sh $datadev/ > $datadev/segments + fi + + # 3.2. 
create uniform segmented directory using: (The durations are in seconds) + + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + utils/data/convert_data_dir_to_whole.sh $datadev ${datadev}_whole + utils/data/get_utt2dur.sh --nj 4 --cmd "$train_cmd" ${datadev}_whole + + utils/data/get_segments_for_data.sh ${datadev}_whole > ${datadev}_whole/segments + utils/data/get_uniform_subsegments.py --max-segment-duration=30 \ + --overlap-duration=5 --max-remaining-duration=15 ${datadev}_whole/segments > \ + ${datadev}_whole/uniform_sub_segments + + utils/data/subsegment_data_dir.sh ${datadev}_whole/ \ + ${datadev}_whole/uniform_sub_segments ${datadev}_segmented + else + utils/data/get_uniform_subsegments.py --max-segment-duration=30 \ + --overlap-duration=5 --max-remaining-duration=15 ${datadev}/segments > \ + ${datadev}/uniform_sub_segments + + utils/data/subsegment_data_dir.sh ${datadev}/ \ + ${datadev}/uniform_sub_segments ${datadev}_segmented + fi +fi diff --git a/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh b/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh new file mode 100755 index 00000000000..3f5c7e547b1 --- /dev/null +++ b/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017 Hainan Xu +# 2018 Ke Li +# 2018 Yiming Wang + + +# [for swahili] +# rnnlm/train_rnnlm.sh: best iteration (out of 40) was 38, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 140.6 / 1019.4. +# Train objf: -6.28 -5.90 -5.70 -5.56 -5.47 -5.40 -5.34 -5.29 -5.25 -5.22 -5.17 -5.16 -5.13 -5.10 -5.07 -5.06 -5.04 -5.01 -4.99 -4.98 -4.97 -4.96 -4.93 -4.93 -4.91 -4.91 -4.89 -4.88 -4.87 -4.86 -4.84 -4.85 -4.81 -4.79 -4.78 -4.76 -4.75 -4.74 -4.73 +# Dev objf: -8.69 -7.76 -7.31 -7.03 -6.98 -7.00 -6.96 -6.96 -6.93 -6.94 + +# %WER 36.75 [ 22836 / 62144, 2758 ins, 6307 del, 13771 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 38.91 [ 24181 / 62144, 2750 ins, 6579 del, 14852 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.0 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9906 59164 | 62.2 23.8 14.0 3.5 41.3 49.1 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9906 59164 | 61.9 23.6 14.6 3.2 41.4 49.5 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5322 37120 | 66.2 21.2 12.6 2.9 36.8 49.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5322 37120 | 65.8 21.1 13.1 2.7 36.9 49.9 | + +# [for tagalog] +# rnnlm/train_rnnlm.sh: best iteration (out of 320) was 125, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 141.2 / 259.6. 
+# Train objf: -6.08 -5.78 -5.62 -5.52 -5.45 -5.40 -5.36 -5.32 -5.28 -5.26 -5.23 -5.20 -5.18 -5.16 -5.14 -5.13 -5.11 -5.10 -5.09 -5.07 -5.06 -5.05 -5.03 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.97 -4.97 -4.97 -4.96 -4.94 -4.94 -4.93 -4.93 -4.92 -4.91 -4.92 -4.91 -4.90 -4.89 -4.89 -4.89 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.81 -4.82 -4.81 -4.81 -4.80 -4.79 -4.79 -4.79 -4.79 -4.80 -4.79 -4.79 -4.79 -4.80 -4.79 -4.78 -4.78 -4.79 -4.77 -4.79 -4.79 -4.78 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.79 -4.79 -4.78 -4.78 -4.78 -4.78 -4.78 -4.79 -4.78 -4.80 -4.79 -4.78 -4.79 -4.80 -4.80 -4.79 -4.79 -4.77 -4.78 -4.77 -4.77 -4.78 -4.75 -4.80 -4.78 -4.77 -4.76 -4.77 -4.76 -4.76 -4.75 -4.75 -4.76 -4.76 -4.77 -4.75 -4.75 -4.75 -4.76 -4.75 -4.76 -4.74 -4.75 -4.75 -4.76 -4.75 -4.75 -4.75 -4.74 -4.76 -4.75 -4.74 -4.78 -4.74 -4.73 -4.77 -4.76 -4.75 -4.74 -4.73 -4.73 -4.75 -4.75 -4.74 -4.76 -4.73 -4.72 -4.76 -4.72 -4.72 -4.73 -4.72 -4.73 -4.75 -4.72 -4.73 -4.76 -4.75 -4.72 -4.72 -4.74 -4.75 -4.73 -4.72 -4.74 -4.74 -4.73 -4.74 -4.74 -4.74 -4.72 -4.70 -4.72 -4.75 -4.74 -4.75 -4.74 -4.76 -4.72 -4.72 -4.74 -4.75 -4.71 -4.74 -4.73 -4.73 -4.73 -4.73 -4.74 -4.75 -4.73 -4.73 -4.72 -4.71 -4.72 -4.71 -4.72 -4.75 -4.72 -4.71 -4.74 -4.71 -4.70 -4.73 -4.73 -4.75 -4.75 -4.72 -4.72 -4.73 -4.75 -4.73 -4.72 -4.72 -4.72 -4.73 -4.76 -4.73 -4.76 -4.74 -4.73 -4.74 -4.74 -4.74 -4.73 -4.73 -4.73 -4.70 -4.73 -4.74 -4.72 -4.73 -4.73 -4.75 -4.72 -4.73 -4.73 -4.75 -4.73 -4.75 -4.75 -4.73 -4.75 -4.74 -4.75 -4.77 -4.74 -4.75 -4.74 -4.73 -4.77 -4.75 -4.74 -4.75 -4.74 -4.77 -4.76 -4.75 -4.79 -4.78 -4.76 -4.76 -4.77 -4.76 -4.75 -4.74 -4.74 -4.78 -4.77 -4.77 -4.78 -4.79 -4.79 -4.79 -4.76 -4.77 -4.76 -4.79 -4.76 -4.77 -4.76 -4.78 -4.80 -4.79 -4.78 -4.82 -4.82 -4.79 -4.80 -4.81 -4.79 -4.77 -4.79 -4.82 -4.81 -4.82 -4.83 -4.85 -4.84 -4.83 -4.85 -4.88 -4.85 -4.87 -4.86 -4.84 -4.87 -4.85 -4.84 +# Dev objf: -8.70 -7.03 -60340.00 -6.61 -6.45 -6.54 -60340.00 -6.34 -60340.00 -60340.00 -6.15 -6.12 -6.03 -6.03 -60340.00 -60340.00 -6.64 -60340.00 -6.01 -5.91 -5.93 -6.06 -5.92 -5.95 -6.00 -6.17 -6.06 -5.92 -5.92 -60340.00 -6.03 -5.93 -5.98 -60340.00 -6.00 -5.90 -5.84 -6.00 -60340.00 -5.95 -5.89 -60340.00 -5.90 -6.14 -5.84 -5.92 -5.83 -5.86 -5.89 -5.84 -60340.00 -5.90 -5.80 -5.87 -5.87 -60340.00 -5.79 -60340.00 -60340.00 -60340.00 -6.56 -5.88 -5.94 -60340.00 -5.84 -60340.00 -5.84 -5.81 -5.77 -60340.00 -60340.00 -60340.00 -5.81 -5.90 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.72 -5.79 -60340.00 -60340.00 -60340.00 -60340.00 -5.72 -5.80 -60340.00 -60340.00 -5.68 -5.73 -5.74 -60340.00 -5.67 -5.63 -60340.00 -5.75 -60340.00 -5.66 -5.71 -5.73 -5.73 -5.75 -60340.00 -5.77 -60340.00 -5.70 -5.70 -5.82 -60340.00 -60340.00 -5.77 -5.72 -5.75 -60340.00 -5.56 -60340.00 -5.73 -60340.00 -60340.00 -5.99 -5.77 -60340.00 -5.65 -5.80 -60340.00 -60340.00 -5.64 -5.67 -5.73 -5.59 -60340.00 -60340.00 -5.73 -60340.00 -60340.00 -5.83 -5.58 -5.64 -5.75 -60340.00 -5.77 -5.68 -60340.00 -60340.00 -5.70 -5.85 -60340.00 -60340.00 -5.82 -6.15 -5.74 -5.73 -5.75 -60340.00 -60340.00 -5.86 -60340.00 -5.80 -5.79 -5.81 -60340.00 -5.89 -60340.00 -5.81 -5.71 -60340.00 -60340.00 -5.65 -5.87 -60340.00 -60340.00 -60340.00 -5.83 -60340.00 -5.94 -5.74 -5.75 -5.75 -60340.00 -5.76 -5.73 -5.76 -60340.00 -60340.00 -5.85 -5.91 -5.98 -60340.00 -5.88 -5.86 -60340.00 -60340.00 -60340.00 -60340.00 -5.91 -5.81 -5.86 -60340.00 -6.10 -6.17 -60340.00 -60340.00 -5.82 -5.82 -60340.00 -60340.00 -6.78 -5.71 -5.87 
-60340.00 -60340.00 -5.98 -5.94 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.81 -60340.00 -60340.00 -60340.00 -5.74 -60340.00 -5.83 -60340.00 -5.96 -5.80 -60340.00 -60340.00 -60340.00 -5.82 -60340.00 -60340.00 -60340.00 -60340.00 -5.80 -60340.00 -60340.00 -60340.00 -60340.00 -5.79 -60340.00 -6.13 -5.97 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.97 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.98 -60340.00 -60340.00 -60340.00 -5.85 -5.92 -5.85 -5.82 -6.04 -60340.00 -60340.00 -60340.00 -60340.00 -5.93 -60340.00 -5.85 -5.87 -5.77 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.89 -60340.00 -60340.00 -60340.00 -60340.00 -6.18 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.92 -6.01 + +# %WER 46.07 [ 29664 / 64382, 3133 ins, 9896 del, 16635 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.5 +# %WER 47.47 [ 30563 / 64382, 3568 ins, 8934 del, 18061 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.5 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 10551 87329 | 53.7 25.3 21.0 4.6 51.0 65.6 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 10551 87329 | 53.4 24.9 21.6 4.3 50.9 65.6 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5933 56887 | 52.6 25.0 22.4 4.9 52.3 73.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5933 56887 | 52.3 24.5 23.1 4.5 52.2 73.9 | + +# [for somali] +# rnnlm/train_rnnlm.sh: best iteration (out of 800) was 133, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 414.5 / 860.9. + +# %WER 56.54 [ 46160 / 81637, 4654 ins, 13070 del, 28436 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 57.85 [ 47226 / 81637, 5002 ins, 12287 del, 29937 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.0 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9852 90609 | 50.4 33.3 16.3 8.2 57.8 74.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9852 90609 | 50.4 33.2 16.4 8.1 57.7 74.9 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 8275 67640 | 53.0 32.8 14.2 8.5 55.5 69.3 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 8275 67640 | 53.0 32.7 14.3 8.3 55.3 69.2 | + + +# Begin configuration section. 
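+# Note on the options below (summary added for clarity, not from the original
+# script): embedding_dim is the word-embedding size; lstm_rpd and lstm_nrpd are
+# the recurrent and non-recurrent projection dims of the LSTM layers defined in
+# the xconfig further down; epochs is the number of RNNLM training epochs,
+# presumably passed on to rnnlm/train_rnnlm.sh.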
+ +embedding_dim=512 +lstm_rpd=128 +lstm_nrpd=128 +stage=0 +train_stage=-10 +epochs=40 + +# variables for lattice rescoring +run_rescore=true +decode_dir_suffix=rnnlm +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +ac_model_dir=exp/chain/tdnn1b_sp +decode_sets="dev analysis1_segmented analysis2_segmented test_dev_segmented eval1_segmented eval2_segmented eval3_segmented" + +dir=exp/rnnlm_lstm_1a +text_dir=data/rnnlm/text +train_text=data/lm/train.txt +dev_text=data/lm/dev.txt +bitext=data/bitext/text.txt +monotext=data/mono/text.txt + +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp + +. ./cmd.sh +. ./utils/parse_options.sh + + +mkdir -p $dir/config +set -e + +for f in ${train_text} ${dev_text} $bitext $monotext; do + + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; look at stage 12 in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + cat $train_text > $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt + cat $monotext > $text_dir/monotext.txt + +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh b/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh new file mode 
100755 index 00000000000..13cf0bde44c --- /dev/null +++ b/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017 Hainan Xu +# 2018 Ke Li +# 2018 Yiming Wang + + +# [for swahili] +# rnnlm/train_rnnlm.sh: best iteration (out of 10) was 5, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 59.1 / 273.1. +# Train objf: -5.48 -4.75 -4.47 -4.30 -4.17 -4.06 -3.96 -3.87 -3.77 -3.68 +# Dev objf: -10.79 -6.00 -5.75 -5.69 -5.62 -5.61 -5.62 -5.66 -5.66 + +# %WER 35.84 [ 22270 / 62144, 2573 ins, 6961 del, 12736 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_11_0.5 +# %WER 48.49 [ 28692 / 59166, 2310 ins, 9200 del, 17182 sub ] exp/chain/tdnn1b_sp/decode_analysis1_segmented_reseg_rnnlm_rescore + +# [for tagalog] +# rnnlm/train_rnnlm.sh: best iteration (out of 10) was 4, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 73.6 / 106.2. +# Train objf: -5.55 -4.83 -4.58 -4.41 -4.28 -4.17 -4.06 -3.96 -3.86 +# Dev objf: -10.54 -4.87 -4.72 -4.67 -4.67 -4.69 -4.71 -4.74 -4.78 + +# %WER 42.91 [ 27628 / 64382, 3624 ins, 8301 del, 15703 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 55.55 [ 48530 / 87362, 4030 ins, 19326 del, 25174 sub ] exp/chain/tdnn1b_sp/decode_analysis1_segmented_reseg_rnnlm_rescore + +# Begin configuration section. + +embedding_dim=512 +lstm_rpd=128 +lstm_nrpd=128 +stage=0 +train_stage=-10 +epochs=40 + +# variables for lattice rescoring +run_rescore=true +decode_dir_suffix=rnnlm +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +ac_model_dir=exp/chain/tdnn1b_sp +#decode_sets="dev analysis1_segmented_reseg test_dev_segmented_reseg eval1_segmented_reseg eval2_segmented_reseg" +decode_sets="dev analysis1_segmented test_dev_segmented eval1_segmented eval2_segmented eval3_segmented" +decode_sets="analysis2_segmented" +#decode_sets="dev eval1_segmented eval2_segmented" +dir=exp/rnnlm_lstm_1a +text_dir=data/rnnlm/text +train_text=data/lm/train.txt +dev_text=data/lm/dev.txt +bitext=data/bitext/text.txt +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp + +. ./cmd.sh +. ./utils/parse_options.sh + + +mkdir -p $dir/config +set -e + +for f in ${train_text} ${dev_text} $bitext; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; look at stage 12 in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + cat $train_text > $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
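+  # (the symbol written to oov.txt below is the OOV word, conventionally <unk>,
+  #  and should match an entry in $lang/words.txt)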
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/score.sh b/egs/material/s5/local/score.sh new file mode 100755 index 00000000000..c7da00fba32 --- /dev/null +++ b/egs/material/s5/local/score.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +echo "$0" "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + diff --git a/egs/material/s5/local/score_segments.sh b/egs/material/s5/local/score_segments.sh new file mode 100755 index 00000000000..064e15ae40d --- /dev/null +++ b/egs/material/s5/local/score_segments.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +echo "$0" "$@" +local/score_wer_segments.sh "$@" +#local/score_cer_segment.sh --stage 2 "$@" + diff --git a/egs/material/s5/local/score_stm.sh b/egs/material/s5/local/score_stm.sh new file mode 100755 index 00000000000..7e1236ce92e --- /dev/null +++ b/egs/material/s5/local/score_stm.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright 2013 Johns Hopkins University (authors: Yenda Trmal) +# 2018 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This scoring script is copied from Babel and modified. +# This is a scoring script for the CTMS in /score_/${name}.ctm +# it tries to mimic the NIST scoring setup as much as possible (and usually does a good job) + +# begin configuration section. +cmd=run.pl +cer=0 +min_lmwt=7 +max_lmwt=17 +model= +stage=0 +ctm_name= +word_ins_penalty=0.0,0.5,1.0 +case_insensitive=true +use_icu=true +icu_transform='Any-Lower' +#end configuration section. + +echo $0 $@ + +[ -f ./path.sh ] && . ./path.sh +[ -f ./cmd.sh ] && . ./cmd.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " && exit; + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --cer (0|1) # compute CER in addition to WER" + exit 1; +fi + +data=$1 +lang=$2 # This parameter is not used -- kept only for backwards compatibility +dir=$3 + +set -e +set -o pipefail +set -u + +ScoringProgram=`which sclite` || ScoringProgram=$KALDI_ROOT/tools/sctk/bin/sclite +[ ! -x $ScoringProgram ] && echo "Cannot find scoring program at $ScoringProgram" && exit 1; +SortingProgram=`which hubscr.pl` || SortingProgram=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! -x $ScoringProgram ] && echo "Cannot find scoring program at $ScoringProgram" && exit 1; + +stm_filter_cmd=cat +[ -x local/stm_filter ] && stm_filter_cmd=local/stm_filter +ctm_filter_cmd=cat +[ -x local/ctm_filter ] && ctm_filter_cmd=local/ctm_filter + +for f in $data/stm ; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +if [ -z $ctm_name ] ; then + name=`basename $data`; # e.g. 
eval2000 +else + name=$ctm_name +fi + +if [ $stage -le 0 ] ; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring/penalty_$wip/log + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/penalty_$wip/log/score.LMWT.log \ + set -e';' set -o pipefail';' \ + cat $dir/score_LMWT_${wip}/${name}.ctm \| $ctm_filter_cmd '>' $dir/score_LMWT_${wip}/${name}.ctm.unsorted '&&' \ + cat $data/stm \| $stm_filter_cmd '>' $dir/score_LMWT_${wip}/stm.unsorted '&&' \ + $SortingProgram sortSTM \<$dir/score_LMWT_${wip}/stm.unsorted \>$dir/score_LMWT_${wip}/stm.sorted '&&' \ + $SortingProgram sortCTM \<$dir/score_LMWT_${wip}/${name}.ctm.unsorted \>$dir/score_LMWT_${wip}/${name}.ctm.sorted '&&' \ + paste -d ' ' \<\(cut -f 1-5 -d ' ' $dir/score_LMWT_${wip}/stm.sorted \) \ + \<\(cut -f 6- -d ' ' $dir/score_LMWT_${wip}/stm.sorted \| uconv -f utf8 -t utf8 -x "$icu_transform" \) \ + \> $dir/score_LMWT_${wip}/stm '&&' \ + paste -d ' ' \<\(cut -f 1-4 -d ' ' $dir/score_LMWT_${wip}/${name}.ctm.sorted \) \ + \<\(cut -f 5- -d ' ' $dir/score_LMWT_${wip}/${name}.ctm.sorted \| uconv -f utf8 -t utf8 -x "$icu_transform" \) \ + \> $dir/score_LMWT_${wip}/${name}.ctm.sorted2 '&&' \ + utils/fix_ctm.sh $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm.sorted2 '&&' \ + $SortingProgram sortCTM \<$dir/score_LMWT_${wip}/${name}.ctm.sorted2 \>$dir/score_LMWT_${wip}/${name}.ctm '&&' \ + $ScoringProgram -s -r $dir/score_LMWT_${wip}/stm stm -h $dir/score_LMWT_${wip}/${name}.ctm ctm \ + -n "$name.ctm" -f 0 -D -F -o sum rsum prf dtl sgml -e utf-8 || exit 1 + done +fi + +if [ $stage -le 1 ]; then + if [ $cer -eq 1 ]; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/penalty_$wip/log/score.LMWT.char.log \ + $ScoringProgram -s -r $dir/score_LMWT_${wip}/stm stm -h $dir/score_LMWT_${wip}/${name}.ctm ctm \ + -n "$name.char.ctm" -o sum rsum prf dtl sgml -f 0 -D -F -c NOASCII DH -e utf-8 || exit 1 + fi +fi + + +echo "Finished scoring on" `date` +exit 0 diff --git a/egs/material/s5/local/score_wer_segments.sh b/egs/material/s5/local/score_wer_segments.sh new file mode 100755 index 00000000000..555ec5056d9 --- /dev/null +++ b/egs/material/s5/local/score_wer_segments.sh @@ -0,0 +1,100 @@ +#!/bin/bash + + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +stats=true +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. 
parse_options.sh || exit 1; + +data=$1 +dir=$2 + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + +mkdir -p $dir/scoring_kaldi +if [ -f $data/reftext ]; then + cat $data/reftext | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +else + echo "$0: No reference text to compute WER" +fi + +if [ $stage -le 0 ]; then + + mkdir -p $dir/scoring_kaldi/log + # begin building hypothesis hyp.txt + # in the same format as $data/reftext + awk '{a[$1]=a[$1]" "$5;}END{for(i in a)print i""a[i];}' \ + $dir/score_10/ctm_out > tmpconcat + if [ -f $data/reftext ]; then + awk -F" " '{print $1}' $data/reftext > tmporder + awk 'FNR==NR {x2[$1] = $0; next} $1 in x2 {print x2[$1]}' \ + tmpconcat tmporder > "$dir/score_10/ctm_out.concat" + $hyp_filtering_cmd $dir/score_10/ctm_out.concat > \ + $dir/scoring_kaldi/hyp.txt || exit 1; + # end building hypothesis hyp.txt + + $cmd $dir/scoring_kaldi/log/score.hyp.log \ + cat $dir/scoring_kaldi/hyp.txt \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:- ">&" $dir/wer || exit 1; + + cat $dir/wer + else + cat tmpconcat > "$dir/score_10/ctm_out.concat" + awk -F" " '{print $1}' $dir/score_10/ctm_out.concat > tmporder + $hyp_filtering_cmd $dir/score_10/ctm_out.concat > \ + $dir/scoring_kaldi/hyp.txt || exit 1; + #exit 0; + #end building hypothesis hyp.txt + + fi + + # building hyp.segmentedXms.txt + for dur in {700,800,900,1000}; do + dursec=`echo $dur' / 1000' | bc -l` + awk '{if ($4 < '$dursec') a[$1]=a[$1]" "$5; else a[$1]=a[$1]" "$5"\n"$1"";}END\ + {for(i in a)print i""a[i];}' $dir/score_10/ctm_out > tmpconcat + rm -rf $dir/score_10/ctm_out.concat.$dur + while read LINE; do + grep "$LINE" "tmpconcat" >> "$dir/score_10/ctm_out.concat."$dur + done < "tmporder" + + $hyp_filtering_cmd $dir/score_10/ctm_out.concat.$dur > $dir/scoring_kaldi/hyp.segmented${dur}ms.txt || exit 1; + done + rm -rf tmpconcat + rm -rf tmporder +fi + +if [ $stage -le 1 ]; then + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/hyp.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/hyp.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + fi +fi diff --git a/egs/material/s5/local/semisup/chain/decode_test.sh b/egs/material/s5/local/semisup/chain/decode_test.sh new file mode 100755 index 00000000000..3d9a1eda1f5 --- /dev/null +++ b/egs/material/s5/local/semisup/chain/decode_test.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Mahsa 
Yarmohammadi +# 2018 Yiming Wang + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +language=swahili +stage=0 +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" +dir=exp/semisup/chain/tdnn_semisup_1a +lang=data/lang_combined_chain +tree_dir=exp/semisup/chain/tree_sp +cmd=queue.pl +graph_affix=_combined + +# training options +chunk_width=140,100,160 +chunk_left_context=0 +chunk_right_context=0 + +# ivector options +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=600 +filter_ctm=true +weights_file= +silence_weight=0.00001 +nj=30 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. ./lang.conf + +if ! cuda-compiled; then + cat </dev/null || true + +if [ $stage -le 3 ]; then + # do the 1st pass decoding + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l /dev/null || true + cp -r data/lang_combined_test $lang_combined + silphonelist=$(cat $lang_combined/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang_combined/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir || exit 1; + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $lat_dir $tree_dir || exit 1 +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l 
dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.stage=$get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l \ +# $sup_chain_dir/best_path_${unsupervised_set_perturbed}/frame_subsampling_factor +# +# # This should be 1 if using a different source for supervised data alignments. +# # However alignments in seed tree directory have already been sub-sampled. 
+# echo $frame_subsampling_factor > \ +# $sup_tree_dir/frame_subsampling_factor +# +# # Build a new tree using stats from both supervised and unsupervised data +# steps/nnet3/chain/build_tree_multiple_sources.sh \ +# --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ +# --frame-subsampling-factor $frame_subsampling_factor \ +# 7000 $lang \ +# data/${supervised_set_perturbed} \ +# ${sup_tree_dir} \ +# data/${unsupervised_set_perturbed} \ +# $chaindir/best_path_${unsupervised_set_perturbed} \ +# $treedir || exit 1 +# fi +# +# sup_tree_dir=$treedir # Use the new tree dir for further steps + +# Train denominator FST using phone alignments from +# supervised and unsupervised data +if [ $stage -le 7 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path_${unsupervised_set_perturbed} \ + $dir +fi + +if [ $stage -le 8 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts 
input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set_perturbed} + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 9 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
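# Worked example of the egs context computed above (the model contexts here are
# illustrative, not taken from this configuration): with model_left_context=28,
# model_right_context=14 and frame_subsampling_factor=3,
#   egs_left_context  = int(28 + 3/2) = 29
#   egs_right_context = int(14 + 3/2) = 15
# i.e. roughly half the subsampling factor of extra input frames on each side.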
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context --right-context-final $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true \ + data/${supervised_set_perturbed}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=4.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=1 # frame-tolerance for chain training + +unsup_lat_dir=${sup_chain_dir}/decode_${unsupervised_set_perturbed} +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set_perturbed} + + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + cp $sup_chain_dir/final.mdl $unsup_lat_dir || exit 1; + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context --right-context-final $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_${unsupervised_set_perturbed}/weights.scp \ + --online-ivector-dir $ivector_root_dir/ivectors_${unsupervised_set_perturbed}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set_perturbed}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 11 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
+ train_stage=-4 +fi + +if [ $stage -le 12 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir="$comb_egs_dir" \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$sup_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --egs.chunk-width=$frames_per_eg \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.frames-per-iter=1500000 \ + --trainer.num-epochs=$num_epochs \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.max-param-change=2.0 \ + --cleanup.remove-egs=false \ + --feat-dir=data/${supervised_set_perturbed}_hires \ + --tree-dir=$sup_tree_dir \ + --lat-dir=$sup_lat_dir \ + --dir=$dir || exit 1; +fi + +test_graph_dir=$dir/graph_combined +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=150 + rm -f $dir/.error 2>/dev/null || true + for data in $test_sets; do + ( + nspk=$(wc -l $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt + cat $monotext > $text_dir/monotext.txt + +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
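# (Note: the symbol written to config/oov.txt below is the lexicon's OOV marker,
# conventionally "<unk>" in these recipes.)  Any such out-of-vocabulary word is
# rewritten to that symbol when the RNNLM examples are prepared.
# For reference, config/data_weights.txt lists one corpus per line in the form
#   <corpus-name> <num-repeats> <weight>
# e.g. "train 1 1.0"; the actual repeats and weights are set in the block below.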
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/semisup/run.sh b/egs/material/s5/local/semisup/run.sh new file mode 100755 index 00000000000..6b22cb1ad36 --- /dev/null +++ b/egs/material/s5/local/semisup/run.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2019 Yiming Wang +# Apache 2.0 + +# This script demonstrates semi-supervised training using ~40 hours of +# supervised data and ~320 hours of unsupervised data. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/semisup + +stage=0 + +. ./utils/parse_options.sh + +############################################################################### +# Train seed chain system using ~40 hours supervised data. +# Here we train i-vector extractor on only the supervised set. +############################################################################### + +if [ $stage -le 1 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train \ + --nnet3-affix "" \ + --affix 1a --tree-affix "" \ + --gmm tri3 --exp-root $exp_root || exit 1 +fi + +if [ $stage -le 2 ]; then + utils/combine_data.sh data/eval1_2_3_segmented data/eval1_segmented data/eval2_segmented data/eval3_segmented || exit 1 +fi + +############################################################################### +# Semi-supervised training using ~40 hours supervised data and +# 320 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
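# In outline (matching the steps in run_tdnn_semisupervised.sh): the seed chain
# model decodes the unlabeled EVAL audio; best-path alignments and per-frame
# weights are taken from those lattices; a denominator FST is re-estimated from
# supervised plus unsupervised phone sequences; and egs from both sources are
# combined with the chosen supervision weights before training the final model.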
+############################################################################### + +if [ $stage -le 3 ]; then + local/semisup/chain/run_tdnn_semisupervised.sh \ + --supervised-set train \ + --unsupervised-set eval1_2_3_segmented \ + --sup-chain-dir $exp_root/chain/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain/tri3_train_sp_lats \ + --sup-tree-dir $exp_root/chain/tree_sp \ + --ivector-root-dir exp/nnet3 \ + --affix 1a \ + --exp-root $exp_root || exit 1 + + # [for swahili] + # %WER 35.2 | 9906 59164 | 67.8 18.4 13.8 3.0 35.2 47.1 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis1_segmented/score_10_0.0/analysis1_segmented_hires.ctm.sys + # %WER 30.8 | 5322 37120 | 71.9 16.4 11.8 2.7 30.8 47.8 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis2_segmented/score_10_0.0/analysis2_segmented_hires.ctm.sys + + # [for tagalog] + # %WER 40.8 | 10551 87329 | 64.0 21.4 14.6 4.8 40.8 63.9 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis1_segmented/score_10_0.0/analysis1_segmented_hires.ctm.sys + # %WER 41.1 | 5933 56887 | 63.8 20.4 15.9 4.9 41.1 71.9 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis2_segmented/score_10_0.0/analysis2_segmented_hires.ctm.sys +fi + diff --git a/egs/material/s5/local/stm_filter b/egs/material/s5/local/stm_filter new file mode 100755 index 00000000000..9409119a54f --- /dev/null +++ b/egs/material/s5/local/stm_filter @@ -0,0 +1,22 @@ +#!/usr/bin/perl + +while (<>) { + chomp; + my @F = split; + my @A = @F[6..$#F]; + for (my $i = 0; $i <= $#A; $i++) { + my $w = $A[$i]; + + # Make partial words optionally detectable + if ($w =~ m/^(\S+-)$/ || $w =~ m/^(-\S+)$/) { + $A[$i] = "(" . $w . ")"; + } + + # Remove filler words + if ($w =~ m/<(unk|noise|spnoise|sil)>/) { + $A[$i] = ""; + } + } + + print join(" ", @F[0..5]) . " " . join(" ", @A) . "\n"; +} diff --git a/egs/material/s5/local/train_lms_srilm.sh b/egs/material/s5/local/train_lms_srilm.sh new file mode 100755 index 00000000000..8160b060dc7 --- /dev/null +++ b/egs/material/s5/local/train_lms_srilm.sh @@ -0,0 +1,224 @@ +#!/bin/bash +export LC_ALL=C + +words_file= +train_text= +dev_text= +oov_symbol="" + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. " + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 +outlm=lm.gz + + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + else + sdir=`pwd`/../../../tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! 
-s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + [ -z "$train_text" ] && train_text=$datadir/train/text + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +fi + + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $train_text | cut -f2- -d' ' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $dev_text | cut -f2- -d' ' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +echo "-------------------" +echo "Good-Turing 2grams" +echo "-------------------" +ngram-count -lm $tgtdir/2gram.gt01.gz -gt1min 0 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/2gram.gt02.gz -gt1min 0 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 2grams" +echo "-------------------" +ngram-count -lm $tgtdir/2gram.kn01.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/2gram.kn02.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab 
$tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 
-text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" tool + echo "-------------------" + echo "Maxent 2grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 2 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/2gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 + +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perplexity scores report is stored in $tgtdir/perplexities.txt " + +#This will link the lowest perplexity LM as the output LM.
+#ln -sf $tgtdir/`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` $outlm + +#A slight modification of the previous approach: +#We look at the two lowest perplexity LMs and use a 3gram LM if one of the two, even if the 4gram is of lower ppl +nof_trigram_lm=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | wc -l` +if [[ $nof_trigram_lm -eq 0 ]] ; then + lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` +elif [[ $nof_trigram_lm -eq 2 ]] ; then + lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` +else #exactly one 3gram LM + lmfilename=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | cut -f 1 -d ' '` +fi +(cd $tgtdir; ln -sf `basename $lmfilename` $outlm ) + diff --git a/egs/material/s5/local/wer_output_filter b/egs/material/s5/local/wer_output_filter new file mode 100755 index 00000000000..5195bb9150d --- /dev/null +++ b/egs/material/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL") || ($s =~ /--|\.|\?|\(\(\)\)|%incomplete/)) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/material/s5/path.sh b/egs/material/s5/path.sh new file mode 100644 index 00000000000..ffa108b6737 --- /dev/null +++ b/egs/material/s5/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +[ ! -f $KALDI_ROOT/tools/env.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/env.sh is not present (this is uncommon but might be OK)" +. $KALDI_ROOT/tools/env.sh +export LC_ALL=C diff --git a/egs/material/s5/rnnlm b/egs/material/s5/rnnlm new file mode 120000 index 00000000000..72302c5e570 --- /dev/null +++ b/egs/material/s5/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm \ No newline at end of file diff --git a/egs/material/s5/run.sh b/egs/material/s5/run.sh new file mode 100755 index 00000000000..4ba518f53e0 --- /dev/null +++ b/egs/material/s5/run.sh @@ -0,0 +1,322 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (Jan "Yenda" Trmal) +# 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Yiming Wang +# 2019 Mahsa Yarmohammadi +# License: Apache 2.0 + +. ./path.sh +. ./cmd.sh + +nj=30 # number of parallel jobs +stage=1 +language=swahili +. utils/parse_options.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. 
./lang.conf + +if [ $stage -le 1 ]; then + local/prepare_text_data.sh $corpus + local/prepare_audio_data.sh $corpus +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh $corpus + utils/validate_dict_dir.pl data/local/dict_nosp + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_nosp data/lang_nosp + utils/validate_lang.pl data/lang_nosp +fi + +if [ $stage -le 3 ]; then + local/train_lms_srilm.sh --oov-symbol "" --words-file \ + data/lang_nosp/words.txt data data/lm + utils/format_lm.sh data/lang_nosp data/lm/lm.gz \ + data/local/dict_nosp/lexiconp.txt data/lang_nosp_test + utils/validate_lang.pl data/lang_nosp_test +fi + +if [ $stage -le 4 ]; then + for set in train dev; do + dir=data/$set + utils/fix_data_dir.sh $dir + steps/make_mfcc.sh --cmd "$train_cmd" --nj 16 $dir + steps/compute_cmvn_stats.sh $dir + utils/fix_data_dir.sh $dir + utils/validate_data_dir.sh $dir + done +fi + +# Create a subset with 40k short segments to make flat-start training easier +if [ $stage -le 5 ]; then + utils/subset_data_dir.sh --shortest data/train $numShorestUtts data/train_short +fi + +# monophone training +if [ $stage -le 6 ]; then + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/train_short data/lang_nosp_test exp/mono + ( + utils/mkgraph.sh data/lang_nosp_test \ + exp/mono exp/mono/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/mono/graph_nosp \ + data/$test exp/mono/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp_test exp/mono exp/mono_ali +fi + +# train a first delta + delta-delta triphone system on all utterances +if [ $stage -le 7 ]; then + steps/train_deltas.sh --cmd "$train_cmd" \ + $numLeavesTri1 $numGaussTri1 data/train data/lang_nosp_test exp/mono_ali exp/tri1 + + # decode using the tri1 model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri1 exp/tri1/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/tri1/graph_nosp \ + data/$test exp/tri1/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp_test exp/tri1 exp/tri1_ali +fi + +# train an LDA+MLLT system. +if [ $stage -le 8 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" $numLeavesTri2 $numGaussTri2 \ + data/train data/lang_nosp_test exp/tri1_ali exp/tri2 + + # decode using the LDA+MLLT model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri2 exp/tri2/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/tri2/graph_nosp \ + data/$test exp/tri2/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ + data/train data/lang_nosp_test exp/tri2 exp/tri2_ali +fi + +# Train tri3, which is LDA+MLLT+SAT +if [ $stage -le 9 ]; then + steps/train_sat.sh --cmd "$train_cmd" $numLeavesTri3 $numGaussTri3 \ + data/train data/lang_nosp_test exp/tri2_ali exp/tri3 + + # decode using the tri3 model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp + for test in dev; do + steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" exp/tri3/graph_nosp \ + data/$test exp/tri3/decode_nosp_$test + done + )& +fi + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory. 
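# For intuition (the word and pronunciation below are made up, format only):
# dict_dir_add_pronprobs.sh turns a lexicon.txt entry such as
#   kitabu  k i t a b u
# into a lexiconp.txt entry carrying a per-pronunciation probability, e.g.
#   kitabu  0.9  k i t a b u
# and the silence counts collected from tri3 are used to add per-word silence
# probabilities (silprob.txt) to the re-created dict and lang directories.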
+if [ $stage -le 10 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp_test exp/tri3 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp \ + exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \ + exp/tri3/pron_bigram_counts_nowb.txt data/local/dict + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + utils/format_lm.sh data/lang data/lm/lm.gz \ + data/local/dict/lexiconp.txt data/lang_test + + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_test exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 11 ]; then + # Test the tri3 system with the silprobs and pron-probs. + + # decode using the tri3 model + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + for test in dev; do + steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" \ + exp/tri3/graph data/$test exp/tri3/decode_$test + done +fi + +mkdir -p data/bitext +mkdir -p data/mono + +srctext_bitext=data/bitext/text +srctext_mono=data/mono/text + +if [ $stage -le 12 ]; then + # Read the foreign part of the bitext as $srctext_bitext and preprocess the text + if [ "$number_mapping" != "" ]; then + echo Number mapping file Found. Converting numbers... + cat $bitext | awk -F"\t" '{print $2;}' | local/normalize_numbers.py $number_mapping > $srctext_bitext + if [[ $mono == *.gz ]]; then + gzip -cd $mono | local/normalize_numbers.py $number_mapping > $srctext_mono + else + cat $mono | local/normalize_numbers.py $number_mapping > $srctext_mono + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 | local/normalize_numbers.py $number_mapping >> $srctext_mono + else + cat $mono2 | local/normalize_numbers.py $number_mapping >> $srctext_mono + fi + fi + else + cat $bitext | awk -F"\t" '{print $2;}' > $srctext_bitext + if [[ $mono == *.gz ]]; then + gzip -cd $mono > $srctext_mono + else + cat $mono > $srctext_mono + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 >> $srctext_mono + else + cat $mono2 >> $srctext_mono + fi + fi + fi + + local/preprocess_external_text.sh --language $language \ + --srctext-bitext ${srctext_bitext} ${srctext_bitext}.txt + + local/preprocess_external_text.sh --language $language \ + --srctext-bitext ${srctext_mono} ${srctext_mono}.txt + + # Combine two sources of text + cat $bitext | awk '{print $1}' > ${srctext_bitext}.header + paste ${srctext_bitext}.header ${srctext_bitext}.txt > ${srctext_bitext}.processed + + if [[ $mono == *.gz ]]; then + gzip -cd $mono | awk '{printf("mono-%d\n",NR)}' > ${srctext_mono}.header + else + cat $mono | awk '{printf("mono-%d\n",NR)}' > ${srctext_mono}.header + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 | awk '{printf("mono-%d\n",NR)}' >> ${srctext_mono}.header + else + cat $mono2 | awk '{printf("mono-%d\n",NR)}' >> ${srctext_mono}.header + fi + fi + paste ${srctext_mono}.header ${srctext_mono}.txt > ${srctext_mono}.processed +fi + +# The next 3 stages are to train g2p from the existing lexicon, +# apply g2p to expand the lexicon using oov words from bitext data +# as in ${dict_root}_nosp. +g2p_workdir=data/local/g2p_phonetisarus +if [ $stage -le 13 ]; then + echo 'Gathering missing words...' 
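# The steps below collect words from the processed bitext and monolingual text
# that the dict_nosp lexicon does not cover; only purely lowercase alphabetic
# tokens are kept (see the grep further down), so G2P is not applied to numbers
# or other artifacts.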
+ mkdir -p ${g2p_workdir} + cat ${srctext_bitext}.txt ${srctext_mono}.txt | \ + local/count_oovs.pl data/local/dict_nosp/lexicon.txt | \ + awk '{for(i=4; i ${g2p_workdir}/missing.txt + cat ${g2p_workdir}/missing.txt | \ + grep "^[a-z]*$" > ${g2p_workdir}/missing_onlywords.txt +fi + +if [ $stage -le 14 ]; then + local/g2p/train_g2p.sh --stage 0 --silence-phones \ + "data/local/dict/silence_phones.txt" data/local/dict_nosp exp/g2p || touch exp/g2p/.error +fi + +dict_root=data/local/dict_combined +if [ $stage -le 15 ]; then + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + mkdir -p ${dict_root}_nosp + rm ${dict_root}_nosp/lexiconp.txt 2>/dev/null || true + cp data/local/dict_nosp/{phones,oov,nonsilence_phones,silence_phones,optional_silence}.txt ${dict_root}_nosp + local/g2p/apply_g2p.sh --var-counts 1 exp/g2p/model.fst ${g2p_workdir} \ + data/local/dict_nosp/lexicon.txt ${dict_root}_nosp/lexicon.txt || exit 1; + + utils/validate_dict_dir.pl ${dict_root}_nosp +fi + +lang_root=data/lang_combined +lmdir=data/lm_combined +if [ $stage -le 16 ]; then + utils/prepare_lang.sh ${dict_root}_nosp "" data/local/lang_combined_nosp ${lang_root}_nosp + utils/validate_lang.pl ${lang_root}_nosp +fi + +# prepare the new LM with bitext data and the new lexicon, +# as in the new test lang directory ${lang_root}_nosp_test + +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" + +if [ $stage -le 17 ]; then + for datadir in $datadev; do + local/preprocess_test.sh $datadir & + done + wait + + mkdir -p $lmdir + mkdir -p $lmdir/mono + mkdir -p $lmdir/bitext + + cat data/analysis1/text | awk '{for(i=2;i<=NF;i++) printf("%s ", $i); print""}' \ + | grep . | shuf | head -n 2000 > $lmdir/dev_text || echo done + + local/train_lms_srilm.sh --oov-symbol "" --words-file ${lang_root}_nosp/words.txt \ + --train-text ${srctext_bitext}.processed --dev-text $lmdir/dev_text \ + data $lmdir/bitext + + local/train_lms_srilm.sh --oov-symbol "" --words-file ${lang_root}_nosp/words.txt \ + --train-text ${srctext_mono}.processed --dev-text $lmdir/dev_text \ + data $lmdir/mono +fi + +if [ $stage -le 18 ]; then + ngram -order 4 -lm data/lm/lm.gz -mix-lm $lmdir/bitext/lm.gz \ + -mix-lm2 $lmdir/mono/lm.gz -lambda 0.3 -mix-lambda2 0.4 \ + -write-lm $lmdir/lm.gz + + utils/format_lm.sh ${lang_root}_nosp $lmdir/lm.gz \ + ${dict_root}_nosp/lexiconp.txt ${lang_root}_nosp_test + utils/validate_lang.pl ${lang_root}_nosp_test +fi + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory ${lang_root}_test. 
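# A note on the interpolation in stage 18 above (following SRILM's ngram
# conventions as we understand them): -lambda 0.3 is the weight of the main LM
# built from the transcribed audio (data/lm/lm.gz), -mix-lambda2 0.4 is the
# weight of the crawled monolingual LM, and the bitext LM receives the
# remainder, 1 - 0.3 - 0.4 = 0.3.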
+if [ $stage -le 19 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train ${lang_root}_nosp_test exp/tri3 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + ${dict_root}_nosp \ + exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \ + exp/tri3/pron_bigram_counts_nowb.txt ${dict_root} + utils/prepare_lang.sh ${dict_root} "" data/local/lang_combined ${lang_root} + + utils/format_lm.sh ${lang_root} $lmdir/lm.gz \ + ${dict_root}/lexiconp.txt ${lang_root}_test +fi + +# After run.sh is finished, run the followings: +# ./local/chain/run_tdnn.sh +# ./local/chain/decode_test.sh --language +# ./local/rnnlm/run_tdnn_lstm.sh +exit 0; diff --git a/egs/material/s5/steps b/egs/material/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/material/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/material/s5/utils b/egs/material/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/material/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/subsegment_data_dir.sh b/egs/wsj/s5/utils/data/subsegment_data_dir.sh index 526fee0b4ef..1b399ba730a 100755 --- a/egs/wsj/s5/utils/data/subsegment_data_dir.sh +++ b/egs/wsj/s5/utils/data/subsegment_data_dir.sh @@ -222,8 +222,11 @@ fi if [ -f $srcdir/glm ]; then cp $srcdir/glm $dir fi +if [ -f $srcdir/stm ]; then + cp $srcdir/stm $dir +fi -for f in stm ctm; do +for f in ctm; do if [ -f $srcdir/$f ]; then echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is " echo " ... not implemented yet (and probably it's not needed.)" diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl index 8f8534c329b..209f9fd40c1 100755 --- a/egs/wsj/s5/utils/validate_dict_dir.pl +++ b/egs/wsj/s5/utils/validate_dict_dir.pl @@ -35,7 +35,7 @@ sub get_utf8_or_bytestream { $is_utf_compatible = $is_utf_compatible && defined($decoded_text); push @unicode_lines, $decoded_text; } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; + #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; ; } push @raw_lines, $raw_text; diff --git a/scripts/rnnlm/lmrescore_nbest.sh b/scripts/rnnlm/lmrescore_nbest.sh index 58b19b9fa79..a22d43961ab 100755 --- a/scripts/rnnlm/lmrescore_nbest.sh +++ b/scripts/rnnlm/lmrescore_nbest.sh @@ -58,7 +58,7 @@ elif [ ! -f $oldlm ]; then exit 1; fi -for f in $rnndir/final.raw $data/feats.scp $indir/lat.1.gz; do +for f in $rnndir/final.raw $indir/lat.1.gz; do [ ! -f $f ] && echo "$0: expected file $f to exist." && exit 1; done @@ -174,6 +174,7 @@ if [ $stage -le 5 ]; then $adir.$n/lmwt.lmonly || exit 1; done fi + if [ $stage -le 6 ]; then echo "$0: invoking rnnlm/compute_sentence_scores.sh which calls rnnlm to get RNN LM scores." $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \ diff --git a/scripts/rnnlm/lmrescore_pruned.sh b/scripts/rnnlm/lmrescore_pruned.sh index 9ba78415708..b6ec694ffd4 100755 --- a/scripts/rnnlm/lmrescore_pruned.sh +++ b/scripts/rnnlm/lmrescore_pruned.sh @@ -26,7 +26,7 @@ normalize=false # If true, we add a normalization step to the output of the RNNL # as in our RNNLM setup, a properly trained network would automatically # have its normalization term close to 1. 
The details of this # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf -lattice_prune_beam=4 # Beam used in pruned lattice composition +lattice_prune_beam=8 # Beam used in pruned lattice composition # This option affects speed and how large the composed lattice may be # End configuration section. diff --git a/src/lat/compose-lattice-pruned.cc b/src/lat/compose-lattice-pruned.cc index 57a7432dca0..cc71db38eab 100644 --- a/src/lat/compose-lattice-pruned.cc +++ b/src/lat/compose-lattice-pruned.cc @@ -658,6 +658,7 @@ void PrunedCompactLatticeComposer::AddFirstState() { composed_state_queue_.push( std::pair<BaseFloat, int32>(expected_cost_offset, state_id)); // actually (0.0, 0). + }
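For reference, a typical invocation of rnnlm/lmrescore_pruned.sh in this recipe
(directories and weights below are illustrative, mirroring the calls made in
local/rnnlm/run_tdnn_lstm.sh) looks like:

  rnnlm/lmrescore_pruned.sh --cmd "$decode_cmd" --weight 0.5 \
    --max-ngram-order 4 --lattice-prune-beam 8 \
    data/lang_combined_test exp/rnnlm_lstm_1a data/dev_hires \
    exp/chain/tdnn1a_sp/decode_dev exp/chain/tdnn1a_sp/decode_dev_rnnlm_rescore

The five positional arguments are the old lang directory, the trained RNNLM
directory, the data directory, the first-pass decode directory, and the output
(rescored) decode directory.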