From 827ddbfff87c0bc5e9ed0ed71000e325268fa7b0 Mon Sep 17 00:00:00 2001
From: xiaohui-zhang
Date: Tue, 9 Jan 2018 13:07:06 -0500
Subject: [PATCH 1/7] multi_en: Fixed acronym normalization, swbd lexicon preparation, OOV pronunciation generation, acoustic data sub-sampling, etc.; Added hub4_97 data

---
 egs/multi_en/s5/README.md | 2 +-
 egs/multi_en/s5/RESULTS | 21 +-
 egs/multi_en/s5/conf/mfcc.conf | 1 +
 egs/multi_en/s5/local/g2p/apply_g2p.sh | 2 +-
 egs/multi_en/s5/local/hub4_96_data_prep.sh | 52 ++++
 egs/multi_en/s5/local/hub4_96_parse_sgm.pl | 235 +++++++++++++++++
 egs/multi_en/s5/local/hub4_97_data_prep.sh | 50 ++++
 egs/multi_en/s5/local/hub4_97_parse_sgm.pl | 235 +++++++++++++++++
 egs/multi_en/s5/local/hub4_data_prep.py | 242 ------------------
 egs/multi_en/s5/local/hub4_en_data_prep.sh | 62 +++++
 egs/multi_en/s5/local/hub4_format_data.pl | 138 ++++++++++
 .../local/hub4_normalize_bn96_transcripts.pl | 33 +++
 .../local/hub4_normalize_bn97_transcripts.pl | 42 +++
 egs/multi_en/s5/local/hub4_utils.py | 174 -------------
 .../s5/local/librispeech_data_prep.sh | 13 +-
 egs/multi_en/s5/local/make_partitions.sh | 8 +-
 egs/multi_en/s5/local/swbd1_data_prep.sh | 10 +-
 egs/multi_en/s5/local/tedlium_prepare_data.sh | 3 +-
 egs/multi_en/s5/local/wsj_data_prep.sh | 3 +-
 egs/multi_en/s5/run.sh | 64 ++---
 20 files changed, 915 insertions(+), 475 deletions(-)
 mode change 100644 => 100755 egs/multi_en/s5/RESULTS
 create mode 100755 egs/multi_en/s5/local/hub4_96_data_prep.sh
 create mode 100755 egs/multi_en/s5/local/hub4_96_parse_sgm.pl
 create mode 100755 egs/multi_en/s5/local/hub4_97_data_prep.sh
 create mode 100755 egs/multi_en/s5/local/hub4_97_parse_sgm.pl
 delete mode 100755 egs/multi_en/s5/local/hub4_data_prep.py
 create mode 100755 egs/multi_en/s5/local/hub4_en_data_prep.sh
 create mode 100755 egs/multi_en/s5/local/hub4_format_data.pl
 create mode 100755 egs/multi_en/s5/local/hub4_normalize_bn96_transcripts.pl
 create mode 100755 egs/multi_en/s5/local/hub4_normalize_bn97_transcripts.pl
 delete mode 100644 egs/multi_en/s5/local/hub4_utils.py

diff --git a/egs/multi_en/s5/README.md b/egs/multi_en/s5/README.md
index 0affcb9cf08..20505c5af6f 100755
--- a/egs/multi_en/s5/README.md
+++ b/egs/multi_en/s5/README.md
@@ -2,7 +2,7 @@
 This is a WIP **English LVCSR recipe** that trains on data from multiple corpora
 * Fisher (1761 hours)
 * Switchboard (317 hours)
 * WSJ (81 hours)
-* HUB4 English Broadcast News (76 hours)
+* HUB4 (1996 & 1997) English Broadcast News (75 + 72 hours)
 * TED-LIUM (118 hours)
 * Librispeech (960 hours)
diff --git a/egs/multi_en/s5/RESULTS b/egs/multi_en/s5/RESULTS
old mode 100644
new mode 100755
index 24b82755b94..2b7e5329f20
--- a/egs/multi_en/s5/RESULTS
+++ b/egs/multi_en/s5/RESULTS
@@ -37,17 +37,16 @@
 exit 0
 # Results with the current data combination, lexicon preparation, and acoustic model training procedures.
 # On eval2000 the final GMM result is 24.3, which is better than the above result (24.9).
-
-multi_a tri1b tg_eval2000 || %WER 40.3 | 4459 42989 | 63.7 26.1 10.2 4.0 40.3 72.9 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys
-multi_a tri1b tg_eval2000.si || %WER 45.3 | 4459 42989 | 59.2 29.3 11.4 4.6 45.3 75.4 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys
-multi_a tri3a tg_eval2000 || %WER 33.3 | 4459 42989 | 70.4 21.0 8.6 3.7 33.3 69.6 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_1.0/eval2000.ctm.filt.sys
-multi_a tri3a tg_eval2000.si || %WER 38.5 | 4459 42989 | 65.9 24.7 9.5 4.4 38.5 72.5 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
-multi_a tri3b tg_eval2000 || %WER 27.9 | 4459 42989 | 75.8 17.9 6.3 3.7 27.9 67.1 | exp/multi_a/tri3b/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys
-multi_a tri3b tg_eval2000.si || %WER 31.6 | 4459 42989 | 71.9 20.3 7.8 3.5 31.6 68.8 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
-multi_a tri3b tg_sp_eval2000 || %WER 26.7 | 4459 42989 | 77.2 17.1 5.7 3.9 26.7 65.6 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_0.0/eval2000.ctm.filt.sys
-multi_a tri3b tg_sp_eval2000.si || %WER 30.6 | 4459 42989 | 73.1 19.6 7.3 3.8 30.6 68.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_12_1.0/eval2000.ctm.filt.sys
-multi_a tri4 tg_eval2000 || %WER 24.8 | 4459 42989 | 78.5 16.0 5.5 3.4 24.8 63.8 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys
-multi_a tri4 tg_eval2000.si || %WER 31.2 | 4459 42989 | 72.6 20.6 6.8 3.9 31.2 67.6 | exp/multi_a/tri4/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
+multi_a tri1b tg_eval2000 || %WER 40.4 | 4459 42989 | 63.8 25.9 10.3 4.2 40.4 72.7 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys
+multi_a tri1b tg_eval2000.si || %WER 45.0 | 4459 42989 | 59.3 28.8 11.9 4.3 45.0 75.0 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
+multi_a tri3a tg_eval2000 || %WER 33.4 | 4459 42989 | 70.5 21.3 8.3 3.9 33.4 69.7 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys
+multi_a tri3a tg_eval2000.si || %WER 38.4 | 4459 42989 | 66.2 24.2 9.6 4.6 38.4 72.3 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
+multi_a tri3b tg_eval2000 || %WER 27.8 | 4459 42989 | 75.7 17.8 6.6 3.5 27.8 66.6 | exp/multi_a/tri3b/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys
+multi_a tri3b tg_eval2000.si || %WER 31.7 | 4459 42989 | 71.8 20.3 7.8 3.6 31.7 69.0 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.5/eval2000.ctm.filt.sys
+multi_a tri3b tg_sp_eval2000 || %WER 26.8 | 4459 42989 | 77.0 17.3 5.7 3.8 26.8 65.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_1.0/eval2000.ctm.filt.sys
+multi_a tri3b tg_sp_eval2000.si || %WER 30.5 | 4459 42989 | 73.7 19.7 6.7 4.2 30.5 68.0 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys
+multi_a tri4 tg_eval2000 || %WER 24.8 | 4459 42989 | 78.6 15.8 5.5 3.5 24.8 64.1 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys
+multi_a tri4 tg_eval2000.si || %WER 31.3 | 4459 42989 | 73.1 20.8 6.2 4.4 31.3 68.7 | exp/multi_a/tri4/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
 multi_a tri5a tg_eval2000 || %WER 24.3 | 4459 42989 | 78.8 15.6 5.6 3.2 24.3 63.3 | exp/multi_a/tri5a/decode_tg_eval2000/score_13_0.0/eval2000.ctm.filt.sys
 multi_a tri5a tg_eval2000.si || %WER 30.6 | 4459 42989 | 73.7 20.2 6.1 4.3 30.6 67.9 | exp/multi_a/tri5a/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
 multi_a tri5a tg_sp_eval2000 || %WER 24.2 | 4459 42989 | 79.1 15.6 5.3 3.3 24.2 63.2 | exp/multi_a/tri5a/decode_tg_sp_eval2000/score_12_0.0/eval2000.ctm.filt.sys
diff --git a/egs/multi_en/s5/conf/mfcc.conf b/egs/multi_en/s5/conf/mfcc.conf
index 4f780bf520c..9a17e801b3f 100644
--- a/egs/multi_en/s5/conf/mfcc.conf
+++ b/egs/multi_en/s5/conf/mfcc.conf
@@ -2,3 +2,4 @@
 --sample-frequency=8000
 --low-freq=20
 --high-freq=3700
+--allow-downsample=true
diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh
index 88b37f21ad8..f8e50302c29 100755
--- a/egs/multi_en/s5/local/g2p/apply_g2p.sh
+++ b/egs/multi_en/s5/local/g2p/apply_g2p.sh
@@ -33,7 +33,7 @@
 cat data/*/train/text | \
   perl -ape 's/\s/\n/g;' | \
   sort | uniq > $workdir/missing.txt
 cat $workdir/missing.txt | \
-  grep "^[a-z0-9.'_-]*$" > $workdir/missing_onlywords.txt
+  grep "^[a-z]*$" > $workdir/missing_onlywords.txt
 echo 'Synthesizing pronunciations for missing words...'
 phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt
diff --git a/egs/multi_en/s5/local/hub4_96_data_prep.sh b/egs/multi_en/s5/local/hub4_96_data_prep.sh
new file mode 100755
index 00000000000..f258ea7b7f5
--- /dev/null
+++ b/egs/multi_en/s5/local/hub4_96_data_prep.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+###########################################################################################
+# This script was copied from egs/hub4_english/s5/local/data_prep/prepare_1996_bn_data.sh
+# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da
+# Changes in lower level script/dir names were made
+###########################################################################################
+
+#!/bin/bash
+# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal)
+# 2017 Vimal Manohar
+# License: Apache 2.0
+
+# This script prepares the 1996 English Broadcast News (HUB4) corpus.
+# /export/corpora/LDC/LDC97S44
+# /export/corpora/LDC/LDC97T22
+
+# Begin configuration section.
+# End configuration section
+set -e -o pipefail
+set -o nounset  # Treat unset variables as an error
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <text-source-dir> <speech-source-dir> <out-dir>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC97T22/hub4_eng_train_trans /export/corpora/LDC/LDC97S44/data data/local/data/train_bn96"
+  exit 1
+fi
+
+text_source_dir=$1   # /export/corpora/LDC/LDC97T22/hub4_eng_train_trans
+speech_source_dir=$2 # /export/corpora/LDC/LDC97S44/data
+out=$3
+
+mkdir -p $out;
+
+ls $text_source_dir/*/*.txt > $out/text.list
+ls $speech_source_dir/*.sph > $out/audio.list
+
+if [ ! -s $out/text.list ] || [ ! -s $out/audio.list ]; then
+  echo "$0: Could not get text and audio files"
+  exit 1
+fi
+
+local/hub4_96_parse_sgm.pl $out/text.list > \
+  $out/transcripts.txt 2> $out/parse_sgml.log || exit 1
+
+if [ ! -s $out/transcripts.txt ]; then
+  echo "$0: Could not parse SGML files in $out/text.list"
+  exit 1
+fi
+
+echo "$0: 1996 English Broadcast News training data (HUB4) prepared in $out"
+exit 0
diff --git a/egs/multi_en/s5/local/hub4_96_parse_sgm.pl b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl
new file mode 100755
index 00000000000..172ec5bb563
--- /dev/null
+++ b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl
@@ -0,0 +1,235 @@
+#!/usr/bin/env perl
+###########################################################################################
+# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
+# The source commit was 9f61a1b0efa76f37fc29fa2dbeede6dc776a0203
+# No change was made
+###########################################################################################
+
+#===============================================================================
+# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal )
+# 2017 Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+require HTML::Parser or die "This script needs HTML::Parser from CPAN";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s };
+
+sub parse_sgml_tag {
+  my $tag = shift(@_);
+  my %ret;
+
+  if ($tag !~ /=/) {
+    return %ret;
+  }
+
+  $tag =~ s/<[a-zA-Z]+ //;
+  $tag =~ s/> *$//;
+  #print $tag . "\n";
+
+  my @key_value_pairs = split / *,? +/, $tag;
+  for my $entry(@key_value_pairs) {
+    (my $key, my $value) = split '=', $entry, 2;
+    $ret{$key}=$value;
+  }
+  return %ret;
+}
+
+if (@ARGV != 1) {
+  print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+  print STDERR "  Usage: $0 <sgml-file-list>\n";
+  print STDERR "  where\n";
+  print STDERR "    <sgml-file-list> is a file containing the official SGML format\n";
+  print STDERR "    transcripts. 
The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) { + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ / 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = $new_time; + ; + } elsif ($line =~ /<\/sync/) { + #print $line; + ; + } elsif ($line =~ /) +# 2017 Vimal Manohar +# License: Apache 2.0 + +# This script prepares the 1997 English Broadcast News (HUB4) corpus. +# /export/corpora/LDC/LDC98S71 +# /export/corpora/LDC/LDC98T28 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 data/local/data/train_bn97" + exit 1 +fi + +text_source_dir=$1 # /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 +speech_source_dir=$2 # /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 +out=$3 + +mkdir -p $out; + +ls $text_source_dir/transcrp/*.sgml > $out/text.list +ls $speech_source_dir/*.sph > $out/audio.list + +if [ ! -s $out/text.list ] || [ ! -s $out/audio.list ]; then + echo "$0: Could not get text and audio files" + exit 1 +fi + +local/hub4_97_parse_sgm.pl $out/text.list > \ + $out/transcripts.txt 2> $out/parse_sgml.log || exit 1 + +if [ ! -s $out/transcripts.txt ]; then + echo "$0: Could not parse SGML files in $out/text.list" + exit 1 +fi + +echo "$0: 1997 English Broadcast News training data (HUB4) prepared in $out" +exit 0 diff --git a/egs/multi_en/s5/local/hub4_97_parse_sgm.pl b/egs/multi_en/s5/local/hub4_97_parse_sgm.pl new file mode 100755 index 00000000000..da2344df7c7 --- /dev/null +++ b/egs/multi_en/s5/local/hub4_97_parse_sgm.pl @@ -0,0 +1,235 @@ +#!/usr/bin/env perl +########################################################################################### +# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1997_hub4_eng.pl +# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da +# No change was made +########################################################################################### + +#!/usr/bin/env perl +#=============================================================================== +# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal ) +# 2017 Vimal Manohar +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. 
+# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +use strict; +use warnings; +use utf8; + +require HTML::Parser or die "This script needs HTML::Parser from CPAN"; +HTML::Parser->import(); + +binmode(STDOUT, ":utf8"); + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +sub parse_sgml_tag { + my $tag = shift(@_); + my %ret; + + if ($tag !~ /=/) { + return %ret; + } + + $tag =~ s/<[a-zA-Z]+ //; + $tag =~ s/> *$//; + #print $tag . "\n"; + + my @key_value_pairs = split / *,? +/, $tag; + for my $entry(@key_value_pairs) { + (my $key, my $value) = split '=', $entry, 2; + $ret{$key}=$value; + } + return %ret; +} + +if (@ARGV != 1) { + print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing the official SGML format\n"; + print STDERR " transcripts. The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) { + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ /