From b93789fff00a18563275374e1beed4ac30272df8 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Dec 2018 20:20:04 -0500 Subject: [PATCH 01/13] updating gale setup --- egs/gale_arabic/s5b/local/chain/copare_wer.sh | 72 ++++++ .../s5b/local/chain/run_chain_common.sh | 82 ++++++ .../s5b/local/chain/tuning/run_tdnn_1b.sh | 220 ++++++++++++++++ .../s5b/local/gale_data_prep_audio.sh | 32 --- .../s5b/local/gale_data_prep_split.sh | 39 --- .../s5b/local/gale_data_prep_txt.sh | 60 ----- egs/gale_arabic/s5b/local/gale_format_data.sh | 60 ----- egs/gale_arabic/s5b/local/gale_train_lms.sh | 81 ------ .../s5b/local/nnet3/run_ivector_common.sh | 136 +++------- egs/gale_arabic/s5b/local/prepare_data.sh | 105 ++++++++ ..._prep_grapheme_dict.sh => prepare_dict.sh} | 26 +- egs/gale_arabic/s5b/local/prepare_lm.sh | 46 ++++ egs/gale_arabic/s5b/local/score.sh | 60 +---- egs/gale_arabic/s5b/run.sh | 239 +++++++----------- 14 files changed, 664 insertions(+), 594 deletions(-) create mode 100755 egs/gale_arabic/s5b/local/chain/copare_wer.sh create mode 100755 egs/gale_arabic/s5b/local/chain/run_chain_common.sh create mode 100755 egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_data_prep_audio.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_data_prep_split.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_data_prep_txt.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_format_data.sh delete mode 100755 egs/gale_arabic/s5b/local/gale_train_lms.sh create mode 100755 egs/gale_arabic/s5b/local/prepare_data.sh rename egs/gale_arabic/s5b/local/{gale_prep_grapheme_dict.sh => prepare_dict.sh} (61%) create mode 100755 egs/gale_arabic/s5b/local/prepare_lm.sh diff --git a/egs/gale_arabic/s5b/local/chain/copare_wer.sh b/egs/gale_arabic/s5b/local/chain/copare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/copare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
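+  # (A note on the flag above: used_epochs is hardcoded to false in this
+  # script; if you ever compare a system trained for a different number of
+  # epochs, set it to true so the objective values below are skipped.)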
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5b/local/chain/run_chain_common.sh b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. +# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..caa0d9d805e --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +# ./local/chain/compare_wer.sh exp/chain/tdnn_1b/ +# System tdnn_1b +# WER 17.23 +# CER 6.83 +# Final train prob -0.0825 +# Final valid prob -0.0987 +# Final train prob (xent) -0.6611 +# Final valid prob (xent) -0.7393 + +# head exp/chain/tdnn_1b/decode_test_rnnlm_1e_2_0.40/scoring_kaldi/best_wer +# WER 16.58 [ 11549 / 69668, 1290 ins, 2389 del, 7870 sub ] exp/chain/tdnn1c_swbd_sp/decode_test_rnnlm_1e_2_0.40/wer_10_0.5 + +# exp/chain/tdnn_1b/: num-iters=441 nj=3..16 num-params=16.5M dim=40+100->1792 combine=-0.081->-0.081 (over 6) xent:train/valid[293,440,final]=(-0.937,-0.659,-0.661/-0.960,-0.739,-0.739) logprob:train/valid[293,440,final]=(-0.124,-0.083,-0.083/-0.127,-0.100,-0.099) + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +test_set=test +gmm=tri2b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 +fi diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh b/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh deleted file mode 100755 index 0125272d06c..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -audio_dvds=${@:1:${#}-1} # all the audio dvds for GALE corpus; ; check audio=( in ../run.sh - -mkdir -p $galeData - -# check that sox is installed -which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed"; exit 1 -fi - -for dvd in $audio_dvds; do - dvd_full_path=$(utils/make_absolute.sh $dvd) - if [[ ! 
-e $dvd_full_path ]]; then - echo missing $dvd_full_path; exit 1; - fi - find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do - id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') - echo "$id sox $file -r 16000 -t wav - |" - done -done | sort -u > $galeData/wav.scp - -echo data prep audio succeded - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh b/egs/gale_arabic/s5b/local/gale_data_prep_split.sh deleted file mode 100755 index b18a4e5b105..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the "; exit 1 -fi - - -#data will data/local - -galeData=$(utils/make_absolute.sh $1) -mkdir -p data/local -dir=$(utils/make_absolute.sh data/local) - - -grep -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.test -grep -v -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.train - -for x in test train; do - outdir=$dir/$x - file=$galeData/all.$x - mkdir -p $outdir - awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk - cp -pr $outdir/utt2spk $outdir/spk2utt - awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments - awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done - - -grep -f local/test_list $galeData/wav.scp > $dir/test/wav.scp - -cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} - {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - -echo data prep split succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh b/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh deleted file mode 100755 index 04529d88ac0..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -txt_dvds=${@:1:${#}-1} # all the txt cds correspoding to the audio corpus; check text=( in ../run.sh - - -top_pwd=`pwd` -txtdir=$galeData/txt -mkdir -p $txtdir; cd $txtdir - -for cdx in $txt_dvds; do - echo "Preparing $cdx" - if [[ $cdx == *.tgz ]] ; then - tar -xvf $cdx - elif [ -d "$cdx" ]; then - ln -s $cdx `basename $cdx` - else - echo "I don't really know what I shall do with $cdx " >&2 - fi -done - -find -L . 
-type f -name "*.tdf" | while read file; do -sed '1,3d' $file # delete the first 3 lines -done > all.tmp$$ - -perl -e ' - ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; - open(IN, "$inFile"); - open(ID, ">$idFile"); - open(TXT, ">$txtFile"); - while () { - @arr= split /\t/,$_; - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} - $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; - print TXT "$arr[7]\n"; - }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" - - -perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ - -paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ - -awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $galeData/all -awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/report -awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/conversational - -cd ..; -rm -fr $txtdir -cd $top_pwd -echo data prep text succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_format_data.sh b/egs/gale_arabic/s5b/local/gale_format_data.sh deleted file mode 100755 index b69c34e68b9..00000000000 --- a/egs/gale_arabic/s5b/local/gale_format_data.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ -f path.sh ]; then - . ./path.sh; else - echo "$0: missing path.sh"; exit 1; -fi - -for dir in test train; do - cp -pr data/local/$dir data/$dir -done - - -mkdir -p data/lang_test - -arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz -[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -r data/lang_test -cp -r data/lang data/lang_test - -gunzip -c "$arpa_lm" | \ - arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst - - -echo "$0: Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst - -## Check lexicon. -## just have a look and make sure it seems sane. -echo "$0: First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - -echo "$0: Performing further checks" - -# Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. - -# Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. - -# Checking that disambiguated lexicon times G is determinizable -# Note: we do this with fstdeterminizestar not fstdeterminize, as -# fstdeterminize was taking forever (presumbaly relates to a bug -# in this version of OpenFst that makes determinization slow for -# some case). -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstdeterminizestar >/dev/null || echo Error - -# Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ - fstisstochastic || echo LG is not stochastic - - -echo gale_format_data succeeded. 
- -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_train_lms.sh b/egs/gale_arabic/s5b/local/gale_train_lms.sh deleted file mode 100755 index 3988ec3818f..00000000000 --- a/egs/gale_arabic/s5b/local/gale_train_lms.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. - - -lexicon=data/local/dict/lexicon.txt -[ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; - - -# This script takes no arguments. It assumes you have already run -# previus steps successfully -# It takes as input the files -#data/local/train.*/text -#data/local/dict/lexicon.txt - - -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - -dir=data/local/lm - mkdir -p $dir - text=data/local/train/text - [ ! -f $text ] && echo "$0: No such file $text" && exit 1; - - cleantext=$dir/text.no_oov - - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ - > $cleantext || exit 1; - - - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs - cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. - cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - - train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# LM is small enough that we don't need to prune it (only about 0.7M N-grams). -# Perplexity over 128254.000000 words is 90.446690 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -echo train lm succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh index f14c8441869..5dc0818393b 100755 --- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh @@ -2,31 +2,29 @@ set -e -o pipefail -# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually -# be called by more scripts). It contains the common feature preparation and iVector-related parts -# of the script. See those scripts for examples of usage. +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). 
It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. stage=0 nj=100 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh train_set=train # you might set this to e.g. train. -gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; +test_sets="test" +gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. num_threads_ubm=32 -nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it - # becomes exp/nnet3_cleaned or whatever. +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff . ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! -f $f ]; then @@ -61,7 +59,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires done @@ -69,7 +67,7 @@ if [ $stage -le 2 ]; then # features; this helps make trained nnets more invariant to test data volume. utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${datadir}_hires steps/compute_cmvn_stats.sh data/${datadir}_hires @@ -78,75 +76,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l /dev/null +if [[ $? != 0 ]]; then + echo "sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! 
-e $dvd_full_path ]]; then
+    echo missing $dvd_full_path; exit 1;
+  fi
+  find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do
+    id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}')
+    echo "$id sox $file -r 16000 -t wav - |"
+  done
+done | sort -u > $gale_data/wav.scp
+echo data prep audio succeeded
+
+gale_data=$(utils/make_absolute.sh "GALE" );
+top_pwd=`pwd`
+txtdir=$gale_data/txt
+mkdir -p $txtdir; cd $txtdir
+
+for cdx in $text1 $text2 $text3; do
+  echo "Preparing $cdx"
+  if [[ $cdx == *.tgz ]] ; then
+    tar -xvf $cdx
+  elif [ -d "$cdx" ]; then
+    ln -s $cdx `basename $cdx`
+  else
+    echo "I don't really know what I shall do with $cdx " >&2
+  fi
+done
+
+find -L . -type f -name "*.tdf" | while read file; do
+sed '1,3d' $file # delete the first 3 lines
+done > all.tmp$$
+
+perl -e '
+  ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0];
+  open(IN, "$inFile");
+  open(ID, ">$idFile");
+  open(TXT, ">$txtFile");
+  while (<IN>) {
+    @arr= split /\t/,$_;
+    $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
+    $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//;
+    if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";}
+    $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n";
+    next if ($rStart == $rEnd);
+    $id =~ s/.sph//g;
+    print ID $id;
+    print TXT "$arr[7]\n";
+  }' "all.tmp$$ allid.tmp$$ contentall.tmp$$"
+
+perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$
+paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$
+
+
+awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all
+awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report
+awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational
+
+cd ..;
+rm -fr $txtdir
+cd $top_pwd
+echo data prep text succeeded
+
+mkdir -p data
+dir=$(utils/make_absolute.sh data/)
+grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test
+grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train
+
+for x in test train; do
+  outdir=data/$x
+  file=$gale_data/all.$x
+  mkdir -p $outdir
+  awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk
+  cp -pr $outdir/utt2spk $outdir/spk2utt
+  awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments
+  awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
+done
+
+grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp
+
+cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline < seg) > 0) {seen[$2]=1;}}
+  {if (seen[$1]) { print $0}}' > $dir/train/wav.scp
+
+echo data prep split succeeded
+
+exit 0
diff --git a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh
similarity index 61%
rename from egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh
rename to egs/gale_arabic/s5b/local/prepare_dict.sh
index 5f101f8245b..abaf8177f77 100755
--- a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh
+++ b/egs/gale_arabic/s5b/local/prepare_dict.sh
@@ -3,39 +3,31 @@
 # Copyright 2017 QCRI (author: Ahmed Ali)
 # Apache 2.0
-
-# run this from ../
+mkdir -p data/local/dict
 dir=$(utils/make_absolute.sh data/local/dict)
-mkdir -p $dir
-
-# (1) Get all available dictionaries; since this is a grapheme model, we mainly need the most frequent word lists
 wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1;
 wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1;
 bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$
 bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$
-# (2) Now we add all the words appeared in the training data
-cat data/local/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$
+
+cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$
 grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla
 cat tmp1.$$ | sed 's:\(\):\1 :g' | sed -e 's: : :g' -e 's: : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$
 paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt
-#(2) Dictionary preparation:
+sed -i '1i <UNK> SIL' $dir/lexicon.txt
-
-# silence phones, one per line.
 echo SIL > $dir/silence_phones.txt
-echo SIL > $dir/optional_silence.txt
-# nonsilence phones; on each line is a list of phones that correspond
-# really to the same base phone.
-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1;
+echo SIL >$dir/optional_silence.txt
-sed -i '1i <UNK> SIL' $dir/lexicon.txt # insert word with phone sil at the beginning of the dictionary
+echo -n "" >$dir/extra_questions.txt
+
+cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1;
 rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$
-echo Dictionary preparation succeeded
-# The script is still missing dates and numbers
+echo Dictionary preparation succeeded
 exit 0
-
diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
new file mode 100755
index 00000000000..571ae1200df
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright 2012 Vassil Panayotov
+#           2017 Ewald Enzinger
+# Apache 2.0
+
+. ./path.sh || exit 1
+
+echo "=== Building a language model ..."
+
+locdata=data/local/lm/
+mkdir -p $locdata
+
+# Language model order
+order=3
+
+. utils/parse_options.sh
+
+# Prepare a LM training corpus from the transcripts
+mkdir -p $locdata
+
+loc=`which ngram-count`;
+if [ -z $loc ]; then
+  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
+  else
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686
+  fi
+  if [ -f $sdir/ngram-count ]; then
+    echo Using SRILM tools from $sdir
+    export PATH=$PATH:$sdir
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir. See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
+  fi
+fi
+
+cat data/train/text | cut -d " " -f 2- > $locdata/train.txt
+
+ngram-count -text $locdata/train.txt -order $order -interpolate \
+  -kndiscount -lm $locdata/lm.gz
+
+#ngram -lm $locdata/lm.gz -ppl $locdata/dev.txt
+echo "*** Finished building the LM model!"
diff --git a/egs/gale_arabic/s5b/local/score.sh b/egs/gale_arabic/s5b/local/score.sh
index 83366f7c7fc..1d84815fc69 100755
--- a/egs/gale_arabic/s5b/local/score.sh
+++ b/egs/gale_arabic/s5b/local/score.sh
@@ -1,60 +1,6 @@
-#!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0
-
-[ -f ./path.sh ] && . ./path.sh
-
-# begin configuration section.
-cmd=run.pl -stage=0 -decode_mbr=true -word_ins_penalty=0.0 -min_lmwt=7 -max_lmwt=17 -iter= #some of the scripts from steps/ seem to use it -#end configuration section. - -echo "$0 $#" - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi -data=$1 -lang_or_graph=$2 -dir=$3 - -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; -done - -mkdir -p $dir/scoring/log - -cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab \ - ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; +#!/bin/bash -# Note: the double level of quoting for the sed command -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - cat $dir/scoring/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index c45f5119949..bbb6349fea8 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -3,177 +3,110 @@ # Copyright 2014 QCRI (author: Ahmed Ali) # Apache 2.0 -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. num_jobs=120 num_decode_jobs=40 +decode_gmm=false +stage=0 +overwrite=false -#NB: You can add whatever number of copora you like. The supported extensions -#NB: (formats) are wav and flac. Flac will be converted using sox and in contrast -#NB: with the old approach, the conversion will be on-the-fly and one-time-only -#NB: during the parametrization. - -#NB: Text corpora scpecification. We support either tgz files, which are unpacked -#NB: or just plain (already unpacked) directories. The list of transcript is then -#NB: obtained using find command - -#Make sure you edit this section to reflect whers you keep the LDC data on your cluster - -#This is CLSP configuration. We add the 2014 GALE data. We got around 2 % -#improvement just by including it. The gain might be large if someone would tweak -# the number of leaves and states and so on. 
-
-#audio=(
-# /export/corpora/LDC/LDC2013S02/
-# /export/corpora/LDC/LDC2013S07/
-# /export/corpora/LDC/LDC2014S07/
-#)
-#text=(
-# /export/corpora/LDC/LDC2013T17
-# /export/corpora/LDC/LDC2013T04
-# /export/corpora/LDC/LDC2014T17
-#)
-
-audio=(
-  /data/sls/scratch/amali/data/GALE/LDC2013S02
-  /data/sls/scratch/amali/data/GALE/LDC2013S07
-  /data/sls/scratch/amali/data/GALE/LDC2014S07
-)
-text=(
-  /data/sls/scratch/amali/data/GALE/LDC2013T17.tgz
-  /data/sls/scratch/amali/data/GALE/LDC2013T04.tgz
-  /data/sls/scratch/amali/data/GALE/LDC2014T17.tgz
-)
+dir1=/export/corpora/LDC/LDC2013S02/
+dir2=/export/corpora/LDC/LDC2013S07/
+dir3=/export/corpora/LDC/LDC2014S07/
+text1=/export/corpora/LDC/LDC2013T17/
+text2=/export/corpora/LDC/LDC2013T04/
+text3=/export/corpora/LDC/LDC2014T17/
 galeData=GALE
-#prepare the data
-#split train dev test
-#prepare lexicon and LM
-
-# You can run the script from here automatically, but it is recommended to run the data preparation,
-# and features extraction manually and and only once.
-# By copying and pasting into your shell.
-
-#copy the audio files to local folder wav and convet flac files to wav
-local/gale_data_prep_audio.sh "${audio[@]}" $galeData || exit 1;
-
-#get the transcription and remove empty prompts and all noise markers
-local/gale_data_prep_txt.sh "${text[@]}" $galeData || exit 1;
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+. ./path.sh
+. ./utils/parse_options.sh # e.g. this parses the above options
+                           # if supplied.
-# split the data to reports and conversational and for each class will have rain/dev and test
-local/gale_data_prep_split.sh $galeData || exit 1;
+if [ $stage -le 0 ]; then
-# get all Arabic grapheme dictionaries and add silence and UNK
-local/gale_prep_grapheme_dict.sh || exit 1;
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing; the script has probably been re-run from the wrong stage"
+    echo "Exiting with status 1 to avoid data corruption"
+    exit 1;
+  fi
+  echo "$0: Preparing data..."
+  local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \
+                        --text1 $text1 --text2 $text2 --text3 $text3
+
+  local/prepare_dict.sh
 #prepare the langauge resources
-utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang || exit 1;
+  utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang
 # LM training
-local/gale_train_lms.sh || exit 1;
+  local/prepare_lm.sh
-local/gale_format_data.sh || exit 1;
-# G compilation, check LG composition
+  utils/format_lm.sh data/lang data/local/lm/lm.gz \
+                     data/local/dict/lexicon.txt data/lang
+fi
 # Now make MFCC features.
 # mfccdir should be some place with a largish disk where you
 # want to store MFCC features.
 mfccdir=mfcc
-
-for x in train test ; do
-  steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \
-    data/$x exp/make_mfcc/$x $mfccdir
-  utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons
-  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
-done
-
+if [ $stage -le 1 ]; then
+  echo "$0: Preparing the test and train feature files..."
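+  # Note: these are the low-resolution MFCCs used by the GMM stages below;
+  # the 40-dim hires features for the chain model are extracted later by
+  # local/nnet3/run_ivector_common.sh (using conf/mfcc_hires.conf).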
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi # Here we start the AM - -# Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - -# Train monophone models on a subset of the data, 10K segment -# Note: the --boost-silence option should probably be omitted by default -steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; - - -# Get alignments from monophone system. -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; - -# train tri1 [first triphone pass] -steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; - -# First triphone decoding -utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri1/graph data/test exp/tri1/decode - -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri1 exp/tri1_ali || exit 1; - -# Train tri2a, which is deltas+delta+deltas -steps/train_deltas.sh --cmd "$train_cmd" \ - 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; - -# tri2a decoding -utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode - -# train and decode tri2b [LDA+MLLT] -steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2b/graph data/test exp/tri2b/decode - -# Align all data with LDA+MLLT system (tri2b) -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - - -# From 2b system, train 3b which is LDA + MLLT + SAT. -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph -steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ - "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode - -# From 3b system, align all data. -steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; +if [ $stage -le 2 ]; then + # Let's create a subset with 10k segments to make quick flat-start training: + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + # Train monophone models on a subset of the data, 10K segment + # Note: the --boost-silence option should probably be omitted by default + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono || exit 1; +fi + +if [ $stage -le 3 ]; then + # Get alignments from monophone system. 
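+  # (These monophone alignments seed tri1 below; in steps/train_deltas.sh the
+  # two numeric arguments are the number of tree leaves (2500) and the total
+  # number of Gaussians (30000).)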
+ steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali || exit 1; - -# nnet3 cross-entropy -local/nnet3/run_tdnn.sh #tdnn recipe: -local/nnet3/run_lstm.sh --stage 12 #lstm recipe (we skip ivector training) - -# chain lattice-free -local/chain/run_tdnn.sh #tdnn recipe: -local/chain/run_tdnn_lstm.sh #tdnn-lstm recipe: - -time=$(date +"%Y-%m-%d-%H-%M-%S") - -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time # to make sure you keep the results timed and owned - + # train tri1 [first triphone pass] + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + # First triphone decoding + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1/graph data/test exp/tri1/decode +fi + +if [ $stage -le 5 ]; then + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + # train and decode tri2b [LDA+MLLT] + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali exp/tri2b || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b/decode +fi + +if [ $stage -le 7 ]; then + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + + local/chain/run_tdnn.sh #tdnn recipe: +fi echo training succedded exit 0 - -#TODO: -#LM (4-gram and RNN) rescoring -#combine lattices -#dialect detection - - - - - From 7dcc6090ac383add2b3bc4bca667eee67fea3f73 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 21 Dec 2018 20:30:48 -0500 Subject: [PATCH 02/13] minor update --- egs/gale_arabic/s5b/local/chain/run_tdnn.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/gale_arabic/s5b/local/chain/run_tdnn.sh b/egs/gale_arabic/s5b/local/chain/run_tdnn.sh index 34499362831..61f8f499182 120000 --- a/egs/gale_arabic/s5b/local/chain/run_tdnn.sh +++ b/egs/gale_arabic/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file From d75efb25f2ca4fe26dbe2644a0584e7876b3894a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 26 Dec 2018 14:55:00 -0500 Subject: [PATCH 03/13] fixing script name --- egs/gale_arabic/s5b/local/chain/{copare_wer.sh => compare_wer.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename egs/gale_arabic/s5b/local/chain/{copare_wer.sh => compare_wer.sh} (100%) diff --git a/egs/gale_arabic/s5b/local/chain/copare_wer.sh b/egs/gale_arabic/s5b/local/chain/compare_wer.sh similarity index 100% rename from egs/gale_arabic/s5b/local/chain/copare_wer.sh rename to egs/gale_arabic/s5b/local/chain/compare_wer.sh From bee4ba275544a12cc72d4690825d202ae2f2afb4 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sun, 30 Dec 2018 20:49:50 -0500 Subject: [PATCH 04/13] fixing bug --- egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh index caa0d9d805e..9e76130e7bd 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh +++ 
b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -100,7 +100,7 @@ if $run_chain_common; then --lores-train-data-dir ${lores_train_data_dir} \ --lang $lang \ --lat-dir $lat_dir \ - --num-leaves 7000 \ + --num-leaves 3500 \ --tree-dir $tree_dir || exit 1; fi From 715d4def57ec68e69ea615d9711b965021ed9b45 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sun, 30 Dec 2018 20:58:09 -0500 Subject: [PATCH 05/13] modification from review, todo: run tri3 --- egs/gale_arabic/s5b/local/chain/run_tdnn.sh | 2 +- .../s5b/local/chain/tuning/run_tdnn_1a.sh | 288 +++++++++--------- .../s5b/local/chain/tuning/run_tdnn_1b.sh | 220 ------------- egs/gale_arabic/s5b/run.sh | 27 +- 4 files changed, 164 insertions(+), 373 deletions(-) delete mode 100755 egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh diff --git a/egs/gale_arabic/s5b/local/chain/run_tdnn.sh b/egs/gale_arabic/s5b/local/chain/run_tdnn.sh index 61f8f499182..34499362831 120000 --- a/egs/gale_arabic/s5b/local/chain/run_tdnn.sh +++ b/egs/gale_arabic/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1b.sh \ No newline at end of file +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh index 7afafb31ff6..b5486decc31 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -1,31 +1,52 @@ #!/bin/bash -#started from tedlium recipe with few edits +# ./local/chain/compare_wer.sh exp/chain/tdnn_1b/ +# System tdnn_1b +# WER 17.23 +# CER 6.83 +# Final train prob -0.0825 +# Final valid prob -0.0987 +# Final train prob (xent) -0.6611 +# Final valid prob (xent) -0.7393 +# head exp/chain/tdnn_1b/decode_test_rnnlm_1e_2_0.40/scoring_kaldi/best_wer +# WER 16.58 [ 11549 / 69668, 1290 ins, 2389 del, 7870 sub ] exp/chain/tdnn1c_swbd_sp/decode_test_rnnlm_1e_2_0.40/wer_10_0.5 -set -e -o pipefail +# exp/chain/tdnn_1b/: num-iters=441 nj=3..16 num-params=16.5M dim=40+100->1792 combine=-0.081->-0.081 (over 6) xent:train/valid[293,440,final]=(-0.937,-0.659,-0.661/-0.960,-0.739,-0.739) logprob:train/valid[293,440,final]=(-0.124,-0.083,-0.083/-0.127,-0.100,-0.099) -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). +set -e -o pipefail stage=0 nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 train_set=train -gmm=tri2b # the gmm for the target data +test_set=test +gmm=tri2b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 #default -10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
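+# common_egs_dir can be set to an egs directory dumped by a previous run in
+# order to skip egs generation; it is passed to train.py via --egs.dir below.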
+common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true # End configuration section. echo "$0 $@" # Print the command line for logging + . ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -39,169 +60,162 @@ where "nvcc" is installed. EOF fi -local/nnet3/run_ivector_common.sh --stage $stage \ - --nj $nj \ - --min-seg-len $min_seg_len \ - --train-set $train_set \ - --gmm $gmm \ - --num-threads-ubm $num_threads_ubm \ - --nnet3-affix "$nnet3_affix" - - -gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb - +if $run_ivector_common; then + local/nnet3/run_ivector_common.sh \ + --stage $stage --nj $nj \ + --train-set $train_set --gmm $gmm \ + --num-threads-ubm $num_threads_ubm \ + --nnet3-affix "$nnet3_affix" +fi -for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ - $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats +dir=exp/chain${nnet3_affix}/tdnn${affix}_sp +train_data_dir=data/${train_set}_sp_hires +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp + +# note: you don't necessarily have to change the treedir name +# each time you do a new experiment-- only if you change the +# configuration in a way that affects the tree. +tree_dir=exp/chain${nnet3_affix}/tree_a_sp +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ + $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \ + $ali_dir/ali.1.gz $gmm_dir/final.mdl; do [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 done -if [ $stage -le 14 ]; then - echo "$0: creating lang directory with one state per phone." - # Create a version of the lang/ directory that has one state per phone in the - # topo file. [note, it really has two states.. the first one is only repeated - # once, the second one has zero or more repeats.] - if [ -d data/lang_chain ]; then - if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then - echo "$0: data/lang_chain already exists, not overwriting it; continuing" - else - echo "$0: data/lang_chain already exists and seems to be older than data/lang..." - echo " ... not sure what to do. Exiting." - exit 1; - fi - else - cp -r data/lang data/lang_chain - silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; - nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; - # Use our special topology... note that later on may have to tune this - # topology. 
- steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo - fi +# Please take this as a reference on how to specify all the options of +# local/chain/run_chain_common.sh +if $run_chain_common; then + local/chain/run_chain_common.sh --stage $stage \ + --gmm-dir $gmm_dir \ + --ali-dir $ali_dir \ + --lores-train-data-dir ${lores_train_data_dir} \ + --lang $lang \ + --lat-dir $lat_dir \ + --num-leaves 3500 \ + --tree-dir $tree_dir || exit 1; fi if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" mkdir -p $dir/configs + cat < $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input - # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=450 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. 
- relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - fi -if [ $stage -le 18 ]; then + +if [ $stage -le 16 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/chain/train.py --stage $train_stage \ + steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ + --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 2 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - + --lat-dir=$lat_dir \ + --dir $dir || exit 1; +fi -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
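# Note (editor's sketch, not part of the original patch): concretely, the
# graph could be built from a hypothetical lang directory with a different
# LM, say data/lang_test_fg, as long as its phones.txt matches:
#
#   utils/lang/check_phones_compatible.sh data/lang_test_fg/phones.txt $lang/phones.txt
#   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_fg $tree_dir $tree_dir/graph_fg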
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang \ + $tree_dir $tree_dir/graph || exit 1; fi -if [ $stage -le 20 ]; then +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/test_hires $dir/decode || exit 1; + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 fi -exit 0 diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh deleted file mode 100755 index 9e76130e7bd..00000000000 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1b.sh +++ /dev/null @@ -1,220 +0,0 @@ -#!/bin/bash - -# ./local/chain/compare_wer.sh exp/chain/tdnn_1b/ -# System tdnn_1b -# WER 17.23 -# CER 6.83 -# Final train prob -0.0825 -# Final valid prob -0.0987 -# Final train prob (xent) -0.6611 -# Final valid prob (xent) -0.7393 - -# head exp/chain/tdnn_1b/decode_test_rnnlm_1e_2_0.40/scoring_kaldi/best_wer -# WER 16.58 [ 11549 / 69668, 1290 ins, 2389 del, 7870 sub ] exp/chain/tdnn1c_swbd_sp/decode_test_rnnlm_1e_2_0.40/wer_10_0.5 - -# exp/chain/tdnn_1b/: num-iters=441 nj=3..16 num-params=16.5M dim=40+100->1792 combine=-0.081->-0.081 (over 6) xent:train/valid[293,440,final]=(-0.937,-0.659,-0.661/-0.960,-0.739,-0.739) logprob:train/valid[293,440,final]=(-0.124,-0.083,-0.083/-0.127,-0.100,-0.099) - -set -e -o pipefail -stage=0 -nj=30 -train_set=train -test_set=test -gmm=tri2b # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -num_threads_ubm=32 -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= - -# LSTM/chain options -train_stage=-10 -xent_regularize=0.1 -dropout_schedule='0,0@0.20,0.5@0.50,0' - -# training chunk-options -chunk_width=150,110,100 -get_egs_stage=-10 - -# training options -srand=0 -remove_egs=true -run_ivector_common=true -run_chain_common=true -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! 
cuda-compiled; then - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 - tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 - tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 - tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 - linear-component name=prefinal-l dim=256 $linear_opts - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 16 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.0 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs 6 \ - --trainer.frames-per-iter 1500000 \ - --trainer.optimization.num-jobs-initial 3 \ - --trainer.optimization.num-jobs-final 16 \ - --trainer.optimization.initial-effective-lrate 0.00025 \ - --trainer.optimization.final-effective-lrate 0.000025 \ - --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.add-option="--optimization.memory-compression-level=2" \ - --egs.chunk-width=$chunk_width \ - --egs.dir="$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ - --egs.stage $get_egs_stage \ - --cleanup.remove-egs=$remove_egs \ - --feat-dir=$train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir=$lat_dir \ - --dir $dir || exit 1; - -fi - -if [ $stage -le 17 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/lang/check_phones_compatible.sh \ - data/lang_test/phones.txt $lang/phones.txt - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang \ - $tree_dir $tree_dir/graph || exit 1; -fi - -if [ $stage -le 18 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - rm $dir/.error 2>/dev/null || true - - steps/nnet3/decode.sh \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context 0 --extra-right-context 0 \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$decode_cmd" --num-threads 4 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ - $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 -fi diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index bbb6349fea8..b2c10ec0a94 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -34,7 +34,8 @@ if [ $stage -le 0 ]; then echo "$0: Preparing data..." local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ --text1 $text1 --text2 $text2 --text3 $text3 - + + echo "$0: Preparing lexicon and LM..." local/prepare_dict.sh utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -45,9 +46,6 @@ if [ $stage -le 0 ]; then data/local/dict/lexicon.txt data/lang fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. mfccdir=mfcc if [ $stage -le 1 ]; then echo "$0: Preparing the test and train feature files..." 
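# Note (editor's sketch, not part of the original patch): the loop body
# elided by this hunk is the usual Kaldi feature pipeline; stage 1
# presumably runs, for each of the train and test sets, something like:
#
#   for x in train test; do
#     steps/make_mfcc.sh --nj $num_jobs --cmd "$train_cmd" data/$x exp/make_mfcc/$x $mfccdir
#     steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir
#     utils/fix_data_dir.sh data/$x
#   done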
@@ -59,41 +57,41 @@ if [ $stage -le 1 ]; then done fi -# Here we start the AM if [ $stage -le 2 ]; then - # Let's create a subset with 10k segments to make quick flat-start training: + echo "$0: creating sub-set and training monophone system" utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - # Train monophone models on a subset of the data, 10K segment - # Note: the --boost-silence option should probably be omitted by default steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ data/train.10K data/lang exp/mono || exit 1; fi if [ $stage -le 3 ]; then - # Get alignments from monophone system. + echo "$0: Aligning data using monophone system" steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/mono exp/mono_ali || exit 1; - - # train tri1 [first triphone pass] + + echo "$0: training triphone system with delta features" steps/train_deltas.sh --cmd "$train_cmd" \ 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; fi if [ $stage -le 4 ] && $decode_gmm; then - # First triphone decoding utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ exp/tri1/graph data/test exp/tri1/decode fi if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/tri1 exp/tri1_ali || exit 1; # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ data/train data/lang exp/tri1_ali exp/tri2b || exit 1; + + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; fi if [ $stage -le 6 ] && $decode_gmm; then @@ -103,10 +101,9 @@ if [ $stage -le 6 ] && $decode_gmm; then fi if [ $stage -le 7 ]; then - steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - + echo "$0: Training a regular chain model using the e2e alignments..." local/chain/run_tdnn.sh #tdnn recipe: fi + echo training succedded exit 0 From 9c16689e98b9425e50004abf9045969eab10b00f Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 1 Jan 2019 22:19:46 -0500 Subject: [PATCH 06/13] modification from review --- egs/gale_arabic/s5b/local/prepare_dict.sh | 51 ++++++++++++-------- egs/gale_arabic/s5b/local/prepare_lexicon.py | 26 ++++++++++ egs/gale_arabic/s5b/local/wer_output_filter | 19 ++++++++ 3 files changed, 77 insertions(+), 19 deletions(-) create mode 100755 egs/gale_arabic/s5b/local/prepare_lexicon.py create mode 100755 egs/gale_arabic/s5b/local/wer_output_filter diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh index abaf8177f77..5feef445357 100755 --- a/egs/gale_arabic/s5b/local/prepare_dict.sh +++ b/egs/gale_arabic/s5b/local/prepare_dict.sh @@ -1,33 +1,46 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright 2017 QCRI (author: Ahmed Ali) # Apache 2.0 +# This script prepares the dictionary. -mkdir -p data/local/dict -dir=$(utils/make_absolute.sh data/local/dict) +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1;
+mkdir -p $dir data/local/lexicon_data

-wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1;
-wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1;
-bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$
-bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$
+if [ $stage -le 0 ]; then
+  echo "$0: Downloading text for lexicon... $(date)."
+  wget -P data/local/lexicon_data $lexicon_url1
+  wget -P data/local/lexicon_data $lexicon_url2
+  bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon
+  bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon
+  cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon
+fi

-cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$
-grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla
-cat tmp1.$$ | sed 's:\(.\):\1 :g' | sed -e 's:  : :g' -e 's:  : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$
-paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt
-sed -i '1i <UNK> SIL' $dir/lexicon.txt
+if [ $stage -le 0 ]; then
+  echo "$0: processing lexicon text and creating lexicon... $(date)."
+  # remove vowels and rare alef wasla
+  grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon
+  local/prepare_lexicon.py
+fi

-echo SIL > $dir/silence_phones.txt
+cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;

-echo SIL >$dir/optional_silence.txt
+sed -i '1i <UNK> UNK' $dir/lexicon.txt

-echo -n "" >$dir/extra_questions.txt
+echo '<sil> SIL' >> $dir/lexicon.txt

-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1;
+echo SIL > $dir/silence_phones.txt

-rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$
+echo SIL >$dir/optional_silence.txt

-echo Dictionary preparation succeeded
+echo -n "" >$dir/extra_questions.txt

-exit 0
+echo "$0: Dictionary preparation succeeded"
diff --git a/egs/gale_arabic/s5b/local/prepare_lexicon.py b/egs/gale_arabic/s5b/local/prepare_lexicon.py
new file mode 100755
index 00000000000..215541585eb
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/prepare_lexicon.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+# Copyright 2018 Ashish Arora
+# Apache 2.0
+
+# This script prepares the lexicon.
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
+args = parser.parse_args()
+
+### main ###
+lex = {}
+text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon')
+with open(text_path, 'r', encoding='utf-8') as f:
+    for line in f:
+        line = line.strip()
+        characters = list(line)
+        characters = " ".join(['V' if char == '*' else char for char in characters])
+        lex[line] = characters
+
+with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp:
+    for key in sorted(lex):
+        fp.write(key + " " + lex[key] + "\n")
diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter
new file mode 100755
index 00000000000..ee5c8809ca7
--- /dev/null
+++ b/egs/gale_arabic/s5b/local/wer_output_filter
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Hossein Hadian
+
+# Apache 2.0
+# This script filters non-scored tokens (unknown-word symbols) out of the hypotheses. It is used in scoring
+
+import sys, io
+import string
+
+infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+
+for line in infile:
+    words = line.strip().split()
+    words = [word for word in words if '<unk>' not in word]
+    uttid = words[0]
+    transcript = ' '.join(words[1:])
+    output.write(uttid + ' ' + transcript + '\n')
From 741c4be023ab3a9627593cd5a11b28bed3fc3fbb Mon Sep 17 00:00:00 2001
From: aarora8
Date: Wed, 2 Jan 2019 00:18:20 -0500
Subject: [PATCH 07/13] fixing bug

---
 egs/gale_arabic/s5b/local/prepare_lm.sh | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
index 571ae1200df..0fe1b60c333 100755
--- a/egs/gale_arabic/s5b/local/prepare_lm.sh
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -8,8 +8,9 @@ echo "=== Building a language model ..."

-locdata=data/local/lm/
-mkdir -p $locdata
+locdata dir=data/local/lm/
+text=data/local/train/text
+lexicon=data/local/dict/lexicon.txt

 # Language model order
 order=3
@@ -17,7 +18,11 @@ order=3
 . utils/parse_options.sh

 # Prepare a LM training corpus from the transcripts
-mkdir -p $locdata
+mkdir -p $dir
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done

 loc=`which ngram-count`;
 if [ -z $loc ]; then
@@ -37,10 +42,11 @@ if [ -z $loc ]; then
   fi
 fi

-cat data/train/text | cut -d " " -f 2- > $locdata/train.txt
+cat data/train/text | cut -d " " -f 2- > $dir/train.txt
+cut -d' ' -f1 $lexicon > $dir/wordlist

-ngram-count -text $locdata/train.txt -order $order -interpolate \
-  -kndiscount -lm $locdata/lm.gz
+ngram-count -text $dir/train.txt -order $order -limit-vocab -vocab $dir/wordlist \
+  -unk -map-unk "<unk>" -kndiscount -interpolate -lm $dir/lm.gz

-#ngram -lm $locdata/lm.gz -ppl $locdata/dev.txt
+#ngram -lm $dir/lm.gz -ppl $dir/dev.txt
 echo "*** Finished building the LM model!"
From 8b3ff02ac96ef1515724632fefa933db47f3d9c5 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Wed, 2 Jan 2019 00:43:05 -0500
Subject: [PATCH 08/13] minor fix

---
 egs/gale_arabic/s5b/local/prepare_lm.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
index 0fe1b60c333..e28c7932e23 100755
--- a/egs/gale_arabic/s5b/local/prepare_lm.sh
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -8,8 +8,8 @@ echo "=== Building a language model ..."
-locdata dir=data/local/lm/
-text=data/local/train/text
+dir=data/local/lm/
+text=data/train/text
 lexicon=data/local/dict/lexicon.txt

 # Language model order
@@ -46,7 +46,7 @@ cat data/train/text | cut -d " " -f 2- > $dir/train.txt
 cut -d' ' -f1 $lexicon > $dir/wordlist

 ngram-count -text $dir/train.txt -order $order -limit-vocab -vocab $dir/wordlist \
-  -unk -map-unk "<unk>" -kndiscount -interpolate -lm $dir/lm.gz
+  -unk -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.gz

 #ngram -lm $dir/lm.gz -ppl $dir/dev.txt
 echo "*** Finished building the LM model!"
From ea38cda4ba6f979e14a3fe0ed6c8c448672addc8 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Thu, 10 Jan 2019 16:49:00 -0500
Subject: [PATCH 09/13] adding train sat basis, unk phone in dict, minor changes

---
 egs/gale_arabic/s5b/cmd.sh                    |  6 ++--
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     |  4 +--
 .../s5b/local/nnet3/run_ivector_common.sh     |  4 +--
 egs/gale_arabic/s5b/local/prepare_data.sh     | 15 +++++-----
 egs/gale_arabic/s5b/local/prepare_dict.sh     |  2 ++
 egs/gale_arabic/s5b/local/prepare_lm.sh       |  5 ++--
 egs/gale_arabic/s5b/local/wer_output_filter   |  2 +-
 egs/gale_arabic/s5b/run.sh                    | 30 ++++++++++++++-----
 8 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/egs/gale_arabic/s5b/cmd.sh b/egs/gale_arabic/s5b/cmd.sh
index 71dd849a93b..ea341c98d4a 100755
--- a/egs/gale_arabic/s5b/cmd.sh
+++ b/egs/gale_arabic/s5b/cmd.sh
@@ -10,6 +10,6 @@
 # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
 # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

-export train_cmd="queue.pl --mem 2G"
-export decode_cmd="queue.pl --mem 4G"
-export mkgraph_cmd="queue.pl --mem 8G"
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="retry.pl queue.pl --mem 4G"
+export mkgraph_cmd="retry.pl queue.pl --mem 8G"
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
index b5486decc31..7d19b88fcc7 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -19,7 +19,7 @@ stage=0
 nj=30
 train_set=train
 test_set=test
-gmm=tri2b  # this is the source gmm-dir that we'll use for alignments; it
+gmm=tri3b  # this is the source gmm-dir that we'll use for alignments; it
            # should have alignments for the specified training data.
 num_threads_ubm=32
 nnet3_affix=   # affix for exp dirs, e.g. it was _cleaned in tedlium.
@@ -201,7 +201,7 @@ if [ $stage -le 17 ]; then
   utils/lang/check_phones_compatible.sh \
     data/lang_test/phones.txt $lang/phones.txt
   utils/mkgraph.sh \
-    --self-loop-scale 1.0 data/lang \
+    --self-loop-scale 1.0 data/lang_test \
    $tree_dir $tree_dir/graph || exit 1;
 fi
diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
index 5dc0818393b..f071842dc0b 100755
--- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
+++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh
@@ -12,7 +12,7 @@ stage=0
 nj=100
 train_set=train   # you might set this to e.g. train.
 test_sets="test"
-gmm=tri2b   # This specifies a GMM-dir from the features of the type you're training the system on;
+gmm=tri3b   # This specifies a GMM-dir from the features of the type you're training the system on;
             # it should contain alignments for 'train_set'.

 num_threads_ubm=32
@@ -146,7 +146,7 @@ fi

 if [ -f data/${train_set}_sp/feats.scp ] && [ $stage -le 7 ]; then
-  echo "$0: $feats already exists.  Refusing to overwrite the features "
+  echo "$0: data/${train_set}_sp/feats.scp already exists.  Refusing to overwrite the features "
   echo " to avoid wasting time.  Please remove the file and continue if you really mean this."
   exit 1;
 fi
diff --git a/egs/gale_arabic/s5b/local/prepare_data.sh b/egs/gale_arabic/s5b/local/prepare_data.sh
index 1561928bb48..aea9ba2dc8e 100755
--- a/egs/gale_arabic/s5b/local/prepare_data.sh
+++ b/egs/gale_arabic/s5b/local/prepare_data.sh
@@ -15,20 +15,20 @@ mkdir -p $gale_data
 # check that sox is installed
 which sox  &>/dev/null
 if [[ $? != 0 ]]; then
-  echo "sox is not installed"; exit 1
+  echo "$0: sox is not installed"; exit 1
 fi

 for dvd in $dir1 $dir2 $dir3; do
   dvd_full_path=$(utils/make_absolute.sh $dvd)
   if [[ ! -e $dvd_full_path ]]; then
-    echo missing $dvd_full_path; exit 1;
+    echo "$0: missing $dvd_full_path"; exit 1;
   fi
   find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do
    id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}')
    echo "$id sox $file -r 16000 -t wav - |"
   done
 done | sort -u > $gale_data/wav.scp
-echo data prep audio succeded
+echo "$0: data prep audio succeeded"

 gale_data=$(utils/make_absolute.sh "GALE" );
 top_pwd=`pwd`
@@ -36,13 +36,13 @@ txtdir=$gale_data/txt
 mkdir -p $txtdir; cd $txtdir

 for cdx in $text1 $text2 $text3; do
-  echo "Preparing $cdx"
+  echo "$0: Preparing $cdx"
   if [[ $cdx == *.tgz ]] ; then
     tar -xvf $cdx
   elif [ -d "$cdx" ]; then
     ln -s $cdx `basename $cdx`
   else
-    echo "I don't really know what I shall do with $cdx " >&2
+    echo "$0: I don't really know what I shall do with $cdx " >&2
   fi
 done

@@ -78,7 +78,7 @@ awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::'
 cd ..; rm -fr $txtdir
 cd $top_pwd

-echo data prep text succeeded
+echo "$0: data prep text succeeded"

 mkdir -p data
 dir=$(utils/make_absolute.sh data/)
@@ -100,6 +100,5 @@ grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp
 cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg)>0) {seen[$2]=1;}} {if (seen[$1]) { print $0}}' > $dir/train/wav.scp

-echo data prep split succeeded
-
+echo "$0: data prep split succeeded"
 exit 0
diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh
index 5feef445357..47b5869fdf1 100755
--- a/egs/gale_arabic/s5b/local/prepare_dict.sh
+++ b/egs/gale_arabic/s5b/local/prepare_dict.sh
@@ -35,6 +35,8 @@ cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '
 sed -i '1i <UNK> UNK' $dir/lexicon.txt

+echo UNK >> $dir/nonsilence_phones.txt
+
 echo '<sil> SIL' >> $dir/lexicon.txt

 echo SIL > $dir/silence_phones.txt
diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh
index e28c7932e23..6fdf35f471a 100755
--- a/egs/gale_arabic/s5b/local/prepare_lm.sh
+++ b/egs/gale_arabic/s5b/local/prepare_lm.sh
@@ -11,7 +11,6 @@ echo "=== Building a language model ..."
 dir=data/local/lm/
 text=data/train/text
 lexicon=data/local/dict/lexicon.txt
-
 # Language model order
 order=3
@@ -42,10 +41,10 @@ if [ -z $loc ]; then
   fi
 fi

-cat data/train/text | cut -d " " -f 2- > $dir/train.txt
+cat data/train/text | cut -d " " -f 2- > $dir/text.txt
 cut -d' ' -f1 $lexicon > $dir/wordlist

-ngram-count -text $dir/train.txt -order $order -limit-vocab -vocab $dir/wordlist \
+ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \
   -unk -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.gz

 #ngram -lm $dir/lm.gz -ppl $dir/dev.txt
 echo "*** Finished building the LM model!"
diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter
index ee5c8809ca7..cf48b434144 100755
--- a/egs/gale_arabic/s5b/local/wer_output_filter
+++ b/egs/gale_arabic/s5b/local/wer_output_filter
@@ -13,7 +13,7 @@ output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

 for line in infile:
     words = line.strip().split()
-    words = [word for word in words if '<unk>' not in word]
+    words = [word for word in words if '<UNK>' not in word]
     uttid = words[0]
     transcript = ' '.join(words[1:])
     output.write(uttid + ' ' + transcript + '\n')
diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh
index b2c10ec0a94..3f12d22495e 100755
--- a/egs/gale_arabic/s5b/run.sh
+++ b/egs/gale_arabic/s5b/run.sh
@@ -5,7 +5,7 @@

 num_jobs=120
 num_decode_jobs=40
-decode_gmm=false
+decode_gmm=true
 stage=0
 overwrite=false

@@ -43,7 +43,7 @@ if [ $stage -le 0 ]; then

   local/prepare_lm.sh

   utils/format_lm.sh data/lang data/local/lm/lm.gz \
-                     data/local/dict/lexicon.txt data/lang
+                     data/local/dict/lexicon.txt data/lang_test
 fi

 mfccdir=mfcc
@@ -86,12 +86,8 @@ if [ $stage -le 5 ]; then
   steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
     data/train data/lang exp/tri1 exp/tri1_ali || exit 1;

-  # train and decode tri2b [LDA+MLLT]
   steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \
     data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
-
-  steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
-    --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
 fi

 if [ $stage -le 6 ] && $decode_gmm; then
@@ -101,10 +97,27 @@ if [ $stage -le 6 ] && $decode_gmm; then
 fi

 if [ $stage -le 7 ]; then
+  echo "$0: Aligning data, then retraining and realigning with SAT (basis fMLLR)"
+  steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \
+    data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
+
+  steps/train_sat_basis.sh --cmd "$train_cmd" \
+    5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
+
+  steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \
+    data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
+fi
+
+if [ $stage -le 8 ] && $decode_gmm; then
+  utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph
+  steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \
+    "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode
+fi
+
+if [ $stage -le 9 ]; then
   echo "$0: Training a regular chain model using the e2e alignments..."
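# Note (editor's sketch, not part of the original patch): by Kaldi convention
# local/chain/run_tdnn.sh below is a symlink into the tuning directory;
# assuming the 1a setup from this series, it would be created as:
#
#   ln -s tuning/run_tdnn_1a.sh local/chain/run_tdnn.sh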
-  local/chain/run_tdnn.sh #tdnn recipe:
+  local/chain/run_tdnn.sh
 fi
-echo training succedded
+echo "$0: training succeeded"
 exit 0
From 8bc9325c31510ddeb89ea48840b2c0fc583ce304 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Thu, 10 Jan 2019 18:28:21 -0500
Subject: [PATCH 10/13] updating results

---
 egs/gale_arabic/s5b/RESULTS                   |  5 ++++
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     | 24 +++++++++----------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index 2260a106654..dec0287b33c 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -70,3 +70,8 @@ Combined Results for Reports and Conversational WER:
 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16
 %WER 39.42 [ 27462 / 69668, 2473 ins, 6051 del, 18938 sub ] exp/tri2a/decode/wer_14
 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13
+
+
+WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 (train_sat_basis)
+current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
+%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
index 7d19b88fcc7..dbdd7157cea 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -1,18 +1,16 @@
 #!/bin/bash

-# ./local/chain/compare_wer.sh exp/chain/tdnn_1b/
-# System tdnn_1b
-# WER 17.23
-# CER 6.83
-# Final train prob -0.0825
-# Final valid prob -0.0987
-# Final train prob (xent) -0.6611
-# Final valid prob (xent) -0.7393
-
-# head exp/chain/tdnn_1b/decode_test_rnnlm_1e_2_0.40/scoring_kaldi/best_wer
-# WER 16.58 [ 11549 / 69668, 1290 ins, 2389 del, 7870 sub ] exp/chain/tdnn1c_swbd_sp/decode_test_rnnlm_1e_2_0.40/wer_10_0.5
-
-# exp/chain/tdnn_1b/: num-iters=441 nj=3..16 num-params=16.5M dim=40+100->1792 combine=-0.081->-0.081 (over 6) xent:train/valid[293,440,final]=(-0.937,-0.659,-0.661/-0.960,-0.739,-0.739) logprob:train/valid[293,440,final]=(-0.124,-0.083,-0.083/-0.127,-0.100,-0.099)
+# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp
+# System tdnn_1a_sp
+# WER 16.66
+# CER 6.70
+# Final train prob -0.0674
+# Final valid prob -0.0832
+# Final train prob (xent) -0.8575
+# Final valid prob (xent) -0.9472
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/
+# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=17.2M dim=40+100->3024 combine=-0.064->-0.064 (over 5) xent:train/valid[293,440,final]=(-1.17,-0.868,-0.858/-1.24,-0.956,-0.947) logprob:train/valid[293,440,final]=(-0.102,-0.068,-0.067/-0.113,-0.084,-0.083)
From fb90d785033e98785284fd1b092cbee7012eeb2a Mon Sep 17 00:00:00 2001
From: aarora8
Date: Mon, 21 Jan 2019 13:33:42 -0500
Subject: [PATCH 11/13] updating results

---
 egs/gale_arabic/s5b/RESULTS                   | 20 +++++++++++++++++--
 .../s5b/local/chain/tuning/run_tdnn_1a.sh     | 17 ++++++++--------
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index dec0287b33c..9ae9ba81b7a 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -72,6 +72,22 @@ Combined Results for Reports and Conversational WER:
 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13


-WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 (train_sat_basis)
-current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
+# Effect of GMM seed model (tri2b instead of tri3b). Using tri3b gives a slightly better result
+# as compared to using tri2b as seed.
 %WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
+%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
+
+# WER with train_sat_basis
+%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5
+
+# Effect of Tree-size (3500, 4500, 7000)
+%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0
+%WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+
+# Effect of l2-regularization on the output with tree-size=7000. l2 on the output (0.005,0.002)
+%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_05_sp/decode_test/wer_9_0.5
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+
+#current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
index dbdd7157cea..a3ccfda04ac 100755
--- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
+++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh
@@ -2,15 +2,16 @@

 # ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp
 # System tdnn_1a_sp
-# WER 16.66
-# CER 6.70
-# Final train prob -0.0674
-# Final valid prob -0.0832
-# Final train prob (xent) -0.8575
-# Final valid prob (xent) -0.9472
+# WER 16.47
+# CER 6.68
+# Final train prob -0.0652
+# Final valid prob -0.0831
+# Final train prob (xent) -0.8965
+# Final valid prob (xent) -0.9964

 # steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/
-# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=17.2M dim=40+100->3024 combine=-0.064->-0.064 (over 5) xent:train/valid[293,440,final]=(-1.17,-0.868,-0.858/-1.24,-0.956,-0.947) logprob:train/valid[293,440,final]=(-0.102,-0.068,-0.067/-0.113,-0.084,-0.083)
+# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083)
+
 set -e -o pipefail
 stage=0
@@ -99,7 +99,7 @@ if $run_chain_common; then
     --lores-train-data-dir ${lores_train_data_dir} \
     --lang $lang \
     --lat-dir $lat_dir \
-    --num-leaves 3500 \
+    --num-leaves 7000 \
     --tree-dir $tree_dir || exit 1;
 fi
From 1abadddcd3d0fe774d31c191a657b6dce3a02f04 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Mon, 21 Jan 2019 14:17:54 -0500
Subject: [PATCH 12/13] minor fix

---
 egs/gale_arabic/s5b/RESULTS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index 9ae9ba81b7a..b0ff31615c7 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -80,10 +80,11 @@ Combined Results for Reports and Conversational WER:

 # WER with train_sat_basis
 %WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5

-# Effect of Tree-size (3500, 4500, 7000)
+# Effect of Tree-size (3500, 4500, 7000, 11000)
 %WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0
 %WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0
 %WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+%WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0

 # Effect of l2-regularization on the output with tree-size=7000. l2 on the output (0.005,0.002)
 %WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_05_sp/decode_test/wer_9_0.5
 %WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
From 42aa3915922ad55bfaa109c580a771c86a7e8f83 Mon Sep 17 00:00:00 2001
From: aarora8
Date: Tue, 22 Jan 2019 15:41:06 -0500
Subject: [PATCH 13/13] modification from the review

---
 egs/gale_arabic/s5b/RESULTS | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS
index b0ff31615c7..e0fb9d38ceb 100644
--- a/egs/gale_arabic/s5b/RESULTS
+++ b/egs/gale_arabic/s5b/RESULTS
@@ -65,6 +65,9 @@ Combined Results for Reports and Conversational WER:
 %WER 32.36 [ 22542 / 69668, 2156 ins, 4184 del, 16202 sub ] exp/tri2b_mmi/decode_it4/wer_11
 %WER 32.50 [ 22640 / 69668, 2393 ins, 3956 del, 16291 sub ] exp/tri2b_mmi/decode_it3/wer_11
 %WER 32.79 [ 22847 / 69668, 2407 ins, 4760 del, 15680 sub ] exp/tri2b_mpe/decode_it3/wer_13
+# WER with train_sat_basis
+%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5
+# WER with train_sat
 %WER 33.61 [ 23413 / 69668, 2817 ins, 4577 del, 16019 sub ] exp/tri3b/decode/wer_17
 %WER 35.73 [ 24894 / 69668, 2630 ins, 4944 del, 17320 sub ] exp/tri3b/decode.si/wer_15
 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16
@@ -74,11 +77,8 @@ Combined Results for Reports and Conversational WER:

 # Effect of GMM seed model (tri2b instead of tri3b). Using tri3b gives a slightly better result
 # as compared to using tri2b as seed.
-%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
-%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
-
-# WER with train_sat_basis
-%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5
+%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3b_sp/decode_test/wer_10_0.0
+%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_2b_sp/decode_test/wer_9_0.0

 # Effect of Tree-size (3500, 4500, 7000, 11000)
 %WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0
 %WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0
 %WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
 %WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0

 # Effect of l2-regularization on the output with tree-size=7000. l2 on the output (0.005,0.002)
-%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_05_sp/decode_test/wer_9_0.5
-%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0
+%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_005_sp/decode_test/wer_9_0.5
+%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_002_sp/decode_test/wer_9_0.0

 #current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh)
 %WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0
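# Note (editor's sketch, not part of the original patch): the comparisons in
# this RESULTS file can be regenerated with the comparison script added in
# this series, e.g. for the tree-size sweep:
#
#   local/chain/compare_wer.sh exp/chain/tdnn_1a_3500_sp exp/chain/tdnn_1a_4500_sp \
#       exp/chain/tdnn_1a_7000_sp exp/chain/tdnn_1a_11000_sp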