diff --git a/egs/hub4_english/s5/README b/egs/hub4_english/s5/README
new file mode 100644
index 00000000000..7db319fe174
--- /dev/null
+++ b/egs/hub4_english/s5/README
@@ -0,0 +1,33 @@
+This is a recipe for the English Broadcast News (HUB4) corpus. The corpora involved are listed below with their LDC catalog numbers.
+
+1996 English Broadcast News Train (HUB4)
+   Speech      LDC97S44
+   Transcripts LDC97T22
+
+1997 English Broadcast News Train (HUB4)
+  Speech       LDC98S71
+  Transcripts  LDC98T28
+
+1995 English Broadcast News (CSR-IV HUB4)
+  LDC96S31
+
+North American News Text Corpus
+  LDC95T21
+
+North American News Text Supplement Corpus
+  LDC98T30
+
+1996 CSR HUB4 Language Model
+  LDC98T31
+
+1996 English Broadcast News Dev and Eval (HUB4) 
+  LDC97S66
+
+1997 HUB4 English Evaluation corpus
+  LDC2002S11
+ 
+1998 HUB4 Broadcast News Evaluation English Test Material
+  LDC2000S86 
+
+1999 HUB4 Broadcast News Evaluation English Test Material
+  LDC2000S88
diff --git a/egs/hub4_english/s5/RESULTS b/egs/hub4_english/s5/RESULTS
new file mode 100644
index 00000000000..c6c719f51fb
--- /dev/null
+++ b/egs/hub4_english/s5/RESULTS
@@ -0,0 +1,9 @@
+for x in exp/*/decode*; do grep Sum $x/score*/*.ctm.*sys | utils/best_wer.sh ; done | sort -k2,2n
+exit 0
+
+%WER 17.8 | 728 32834 | 84.1 11.8 4.1 1.9 17.8 82.8 | exp/tri4/decode_nosp_eval97.pem_rescore/score_13_0.5/eval97.pem.ctm.filt.sys
+%WER 19.0 | 728 32834 | 83.0 12.7 4.3 2.0 19.0 84.2 | exp/tri4/decode_nosp_eval97.pem/score_13_0.0/eval97.pem.ctm.filt.sys
+%WER 19.4 | 728 32834 | 82.7 13.1 4.2 2.1 19.4 83.8 | exp/tri3/decode_nosp_eval97.pem_rescore/score_13_0.0/eval97.pem.ctm.filt.sys
+%WER 20.5 | 728 32834 | 81.7 13.9 4.4 2.3 20.5 85.0 | exp/tri3/decode_nosp_eval97.pem/score_13_0.0/eval97.pem.ctm.filt.sys
+%WER 23.7 | 728 32834 | 79.0 16.0 5.0 2.7 23.7 85.3 | exp/tri4/decode_nosp_eval97.pem.si/score_12_0.0/eval97.pem.ctm.filt.sys
+%WER 25.7 | 728 32834 | 77.1 17.6 5.3 2.8 25.7 85.9 | exp/tri3/decode_nosp_eval97.pem.si/score_13_0.0/eval97.pem.ctm.filt.sys
diff --git a/egs/hub4_english/s5/cmd.sh b/egs/hub4_english/s5/cmd.sh
new file mode 100755
index 00000000000..43f7b21771a
--- /dev/null
+++ b/egs/hub4_english/s5/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 1G"
+export decode_cmd="queue.pl --mem 4G"
diff --git a/egs/hub4_english/s5/conf/mfcc.conf b/egs/hub4_english/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..7361509099f
--- /dev/null
+++ b/egs/hub4_english/s5/conf/mfcc.conf
@@ -0,0 +1 @@
+--use-energy=false   # only non-default option.
diff --git a/egs/hub4_english/s5/conf/vad.conf b/egs/hub4_english/s5/conf/vad.conf
new file mode 100644
index 00000000000..a0ca2449b10
--- /dev/null
+++ b/egs/hub4_english/s5/conf/vad.conf
@@ -0,0 +1,2 @@
+--vad-energy-threshold=5.5
+--vad-energy-mean-scale=0.5
diff --git a/egs/hub4_english/s5/local/data_prep/csr4_utils.patch b/egs/hub4_english/s5/local/data_prep/csr4_utils.patch
new file mode 100644
index 00000000000..1b7dcb4ec1b
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/csr4_utils.patch
@@ -0,0 +1,793 @@
+diff -Naur tools/csr4_utils/abbrproc.perl local/data_prep/csr_hub4_utils/abbrproc.perl
+--- tools/csr4_utils/abbrproc.perl	1996-08-27 15:25:15.000000000 -0400
++++ local/data_prep/csr_hub4_utils/abbrproc.perl	2017-11-03 13:22:09.466213159 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+ # $Id: abbrproc.perl,v 1.3 1996/08/21 20:05:09 robertm Rel $
+ ###############################################################################
+ # This software is being provided to you, the LICENSEE, by the Massachusetts  #
+diff -Naur tools/csr4_utils/artfilter.perl local/data_prep/csr_hub4_utils/artfilter.perl
+--- tools/csr4_utils/artfilter.perl	1996-01-04 11:31:57.000000000 -0500
++++ local/data_prep/csr_hub4_utils/artfilter.perl	2017-11-03 13:22:09.470213159 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+ 
+ # artfilter.perl 
+ 
+diff -Naur tools/csr4_utils/bugproc.perl local/data_prep/csr_hub4_utils/bugproc.perl
+--- tools/csr4_utils/bugproc.perl	1996-08-27 15:25:15.000000000 -0400
++++ local/data_prep/csr_hub4_utils/bugproc.perl	2017-11-03 13:22:09.474213159 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+ # $Id: bugproc.perl,v 1.4 1996/08/21 23:55:40 robertm Rel $
+ ###############################################################################
+ # This software is being provided to you, the LICENSEE, by the Massachusetts  #
+diff -Naur tools/csr4_utils/do-lm local/data_prep/csr_hub4_utils/do-lm
+--- tools/csr4_utils/do-lm	1996-08-27 15:25:15.000000000 -0400
++++ local/data_prep/csr_hub4_utils/do-lm	2017-11-27 14:21:15.965400509 -0500
+@@ -22,19 +22,22 @@
+ 	exit 1
+ fi
+ 
+-PATH=$PATH:./bin ; export PATH
++dir=$1
++shift
+ 
+ for file in $*
+ do
+ 	BASENM=`basename $file`
++  name="${BASENM%.*}"
++
+ 	echo "Running LM pipeline for |$BASENM|..." 1>&2
+ 	set -x
+-	pare-sgml.perl $file |
+-	 bugproc.perl |
+-	 numhack.perl |
+-	 numproc.perl |
+-	 abbrproc.perl |
+-	 puncproc.perl > lm/$BASENM
++  gunzip -c $file | pare-sgml.perl | \
++    bugproc.perl | \
++    numhack.perl | \
++    numproc.perl -xtools/csr4_utils/num_excp | \
++    abbrproc.perl tools/csr4_utils/abbrlist | \
++    puncproc.perl -np | gzip -c > $dir/$name.txt.gz
+ 	set +x
+ 	echo "Done with $BASENM."
+ done
+diff -Naur tools/csr4_utils/numhack.perl local/data_prep/csr_hub4_utils/numhack.perl
+--- tools/csr4_utils/numhack.perl	1996-08-27 15:25:16.000000000 -0400
++++ local/data_prep/csr_hub4_utils/numhack.perl	2017-11-03 13:22:09.482213158 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+ 
+ # $Id: numhack.perl,v 1.4 1996/08/23 05:12:27 robertm Rel $
+ # preprocessor for numproc, potentially specialized for Broadcast News material
+diff -Naur tools/csr4_utils/numproc.perl local/data_prep/csr_hub4_utils/numproc.perl
+--- tools/csr4_utils/numproc.perl	1996-08-27 15:25:16.000000000 -0400
++++ local/data_prep/csr_hub4_utils/numproc.perl	2017-11-08 16:59:50.497562934 -0500
+@@ -1,4 +1,5 @@
+-#!/usr/local/bin/perl
++#! /usr/bin/perl
++#
+ # $Id: numproc.perl,v 1.7 1996/08/23 05:04:11 robertm Rel $
+ ###############################################################################
+ # This software is being provided to you, the LICENSEE, by the Massachusetts  #
+@@ -74,7 +75,7 @@
+ {	if($ARGV[$i] =~ /^-/)
+ 	{	if($ARGV[$i] =~ /^-v/) {$vflg=1;}
+ 		elsif($ARGV[$i] =~ /^-x/)
+-		{	$exfile=$ARGV[i];
++		{	$exfile=$ARGV[$i];
+ 			$exfile =~ s/^-x//;
+ 		}
+ 		else {&perr2("illegal flag: $ARGV[$i]");}
+@@ -237,7 +238,7 @@
+ 	if(/\d/ && !/^<\/?[spa]/)		# opt and protect sgml
+ 	{	@input = split(/\s+/o);
+ 		@output=();
+-	wloop:	for($field=0;$field<=$#input;$field++)	# $field is global
++	for($field=0;$field<=$#input;$field++)	# $field is global
+ 		{	if($field>0) {$last=$input[$field-1];}
+ 			else {$last='';}
+ 			if($field<$#input) {$next=$input[$field+1];}
+@@ -248,27 +249,27 @@
+ 			$_=$input[$field];
+ 	
+ 			if(/<[\w\.\/]*>/o && !/<p/o && !/<\/p>/o) # pass only
+-				{&perr("spurious SGML: $_");}	# <p... and </p>
++				{&perr("spurious SGML: $_"); next; }	# <p... and </p>
+ 	
+ 			if(/[0-9]/o && !/<p/o)		# number but not <p
+ 			{	if(/[\$\#]/o)			# money
+-					{&money($_,$next);}
++					{if (! &money($_,$next)) {next;} }
+ 				elsif(/\d:\d\d$/o || /\d:\d\d\D/o)	# time
+-					{&printtime($_);}
++					{if (! &printtime($_)) {next;} }
+ 				elsif(/\d+\/\d+\/\d+/o)		# x/x/x date
+-					{&printdate($_);}
++					{if (! &printdate($_)) {next;} }
+ 				elsif((/[a-zA-Z].*\d/ || /\d.*[a-zA-Z]/)
+ 				      && 
+ 				      !(/\dth\W*/i || /1st\W*/i || /2nd\W*/i
+ 					|| /3rd\W*/i
+ 					|| (/\d\'?s\W*/
+ 					    && (! /\d[a-zA-Z]+\d+\'?s\W*$/))))
+-					{&printserno($_);}	 # serial no
++					{if (! &printserno($_)) {next;} }	 # serial no
+ 				elsif(/\//o)			# fraction
+-					{&printfrac($_);}
++					{if (! &printfrac($_)) {next;} }
+ 				elsif(/\d\'-?\d+/o)		# ft inches
+-					{&printftin($_);}
+-				else {&printnum($_); }	      # ordinary number
++					{if (! &printftin($_)) {next;} }
++				else {if (! &printnum($_)) {next;} }	      # ordinary number
+ 			}
+ 			else {&pusho($_ );}		# non-numeric string
+ 		}
+@@ -348,7 +349,7 @@
+ 		$subunit_sing='penny';
+ 		$subunit_pl='pence';
+ 	}
+-	else {&perr("money: unknown currency");}
++	else {&perr("money: unknown currency"); return 0;}
+ 
+ 	($back)=/(\D*)$/;
+ 	$back =~ s/^s//;	# $40s -> $40
+@@ -362,32 +363,32 @@
+ 	if($x =~ /\//)
+ 	{	$x =~ s/^\D*//;
+ 		$x =~ s/\D*$//;
+-		&printfrac($x);
++		if (! &printfrac($x)) {return 0;}
+ 		&pusho("of a $unit");
+ 		$x="";
+ 		$plural=0;
+ 	}
+ 
+ 	$x =~ s/^\D*([\d,]*)\D*.*$/$1/;		# int part of string
+-	if($x ne "") {&printint($x);}		# print int part (eg. dollars)
++	if($x ne "") {if (! &printint($x)) {return 0;} }		# print int part (eg. dollars)
+ 
+ 	if($next eq "and" && $next2 =~ /\d\/\d/ && next2 !~ /\/.*\//)
+ 	{	if($unit && $x ne "") {&pusho("and");}	      # frac: eg 4 1/16
+ 		$z=$next2;
+ 		$z =~ s/\D*$//;
+-		&printfrac($z);
++		if (! &printfrac($z)) {return 0;}
+ 		($punct)=($next2 =~ /(\D*)$/);
+ 		$field+=2;
+ 		&pusho("${unit}s");
+ 	
+-		if($back) {&perr("money: back and 1 1/3");}
++		if($back) {&perr("money: back and 1 1/3"); return 0;}
+ 		
+ 		if($punct) {&appendo($punct);}	# punctuation from *illion
+-		return;
++		return 1;
+ 	}
+ 
+ 	if($back eq "" && $next =~ /^(thousands?|[a-z]*illions?)(\W*)/i)
+-	{	&printdecfrac($_);			# multiplier
++	{	if (! &printdecfrac($_)) {return 0;}			# multiplier
+ 		&pusho($1);
+ 		$punct=$2;
+ 		$plural=1;			### if adj '', if noun 's'
+@@ -395,7 +396,7 @@
+ 		$frac=1;
+ 	}
+ 	elsif(/\.\d$/ || /\.\d\D/ || /\.\d{3}/ )	# .d or .ddd+
+-	{	&printdecfrac($_);
++	{	if (! &printdecfrac($_)) {return 0;}
+ 		$plural=1;			# can be either
+ 		$frac=1;
+ 	}
+@@ -409,7 +410,7 @@
+ 	{	$unit="";			# fix "$1 dollar" wsj typo
+ 		$subunit_sing="";
+ 		$subunit_pl="";
+-		&printdecfrac($_);
++		if (! &printdecfrac($_)) {return 0;}
+ 		$frac=1;
+ 	}
+ 
+@@ -447,24 +448,26 @@
+ 	{	$y=$_;
+ 		$y =~ s/^[^\.]*\.([\d]*)\D?.*$/$1/;	# get fractional part
+ 		if($unit && $x ne "") {&pusho("and");}
+-		&printint($y);
++		if (! &printint($y)) {return 0;}
+ 		if($sing || int($y)==1) {&pusho($subunit_sing);}
+ 		else {&pusho($subunit_pl);}
+ 	}
+ 
+ 	if($back)				# punctuation from this field
+-	{	if($punct) {&perr("money: back and punct");}
++	{	if($punct) {&perr("money: back and punct"); return 0;}
+ 
+ 		if($back =~ /^\w/) {&pusho($back);}
+ 		else {&appendo($back);}
+ 	}
+ 		
+ 	if($punct) {&appendo($punct);}		# punctuation from *illion
++
++  return 1;
+ }
+ 
+ sub printyear			# &printyear(x)
+ {	if($vflg) {print "printyear: $_[0]\n";}
+-	&printnum($_[0]);		# for now
++	return &printnum($_[0]);		# for now
+ }
+ 
+ sub printtime			# &printtime(x)
+@@ -475,7 +478,7 @@
+ 	local($front);
+ 	local($back);
+ 
+-	if(/:{2,}/ || !/\d:\d/) {&perr("printtime: not a time");}
++	if(/:{2,}/ || !/\d:\d/) {&perr("printtime: not a time"); return 0;}
+ 
+ 	@x=split(/:/,$_);
+ 	($front)=($x[0] =~ /^(\D*)/);
+@@ -487,20 +490,21 @@
+ 	{	&pusho($front);			# generally punctuation
+ 		if($front !~ /\w$/) {$appendflg=1;}
+ 	}
+-	&printint($x[0]);
++	if (! &printint($x[0])) {return 0;}
+ 	if($x[1]==0)
+ 	{	$_=$next;
+ 		if(!/^[aApP]\.?[nM]\.?$/) {&pusho("o'clock");}
+ 	}
+ 	elsif ($x[1]<10)
+ 	{	&pusho("oh");
+-		&printint($x[1]);
++		if (!&printint($x[1])) {return 0;}
+ 	}
+-	else {&printint($x[1]);}
++	else {if (! &printint($x[1])) {return 0;} }
+ 	if($back)
+ 	{	if($back =~ /^\w/) {&pusho($back);}
+ 		else {&appendo($back);}		# generally punctuation
+ 	}
++  return 1;
+ }
+ 
+ sub printfrac
+@@ -530,8 +534,8 @@
+ 	}
+ 
+ 	@z=split(/\//,$x);
+-	if($#z !=1) {&perr("printfrac: illegal fraction: $_[0]");}
+-	if($z[1] <= 1) {&perr("printfrac: den too small: $_[0]");}
++	if($#z !=1) {&perr("printfrac: illegal fraction: $_[0]"); return 0;}
++	if($z[1] <= 1) {&perr("printfrac: den too small: $_[0]"); return 0;}
+ 
+ 	if($front) 
+ 	{	&pusho($front);
+@@ -541,22 +545,22 @@
+ 
+ 	if($sign) {&pusho($sign);}
+ 
+-	&printint($z[0]);			#numerator
++	if (! &printint($z[0])) { return 0;}			#numerator
+ 	if($z[1] <= $#den)			# small den from table (<20)
+ 	{	&pusho($den[$z[1]]);
+-		if($z[0]!=1) {&pluralize;}
++		if($z[0]!=1) {if (! &pluralize) {return 0;} }
+ 	}
+ 	else					#large den
+ 	{	$ones=int($z[1]%100);
+ 		$hun=100*int($z[1]/100);
+-		if($hun>0) {&printint($hun);}
++		if($hun>0) {if (!&printint($hun)) {return 0;} }
+ 		if($ones==0) 
+ 		{	&appendo("th");
+-			if($z[0]!=1) {&pluralize;}
++			if($z[0]!=1) {if (! &pluralize) {return 0;} }
+ 		}
+ 		elsif($ones<=$#largeden)		# <20
+ 		{	&pusho($largeden[$ones]);
+-			if($z[0]!=1) {&pluralize;};
++			if($z[0]!=1) {if (!&pluralize) {return 0;} }
+ 		}
+ 		else
+ 		{	$x=int($ones%10);
+@@ -569,11 +573,11 @@
+ 			}
+ 			if($x==0)
+ 			{	&pusho("th");
+-				if($z[0]!=1) {&pluralize;}
++        if($z[0]!=1) {if (! &pluralize) {return 0;} }
+ 			}
+ 			else
+ 			{	&pusho($largeden[$x]);
+-				if($z[0]!=1) {&pluralize;}
++        if($z[0]!=1) {if (! &pluralize) {return 0;} }
+ 			}
+ 		}
+ 	}
+@@ -585,6 +589,8 @@
+ 			&appendo($back);
+ 		}
+ 	}
++  
++  return 1;
+ }
+ 
+ sub printnum			# printnum(n)
+@@ -624,7 +630,7 @@
+ 		$x =~ s/\D*$//;			# strip back: final . is punct
+ 	}
+ 
+-	if($x =~ /[^\d\.,]/) {&perr("printnum: $_[0] is not a number");}
++	if($x =~ /[^\d\.,]/) {&perr("printnum: $_[0] is not a number"); return 0;}
+ 
+ 	if($x!=0 && $x =~ /^0/ && $x =~ /^\d*$/)	# "oh" numbers
+ 	{	if($front) 
+@@ -641,7 +647,7 @@
+ 
+ 		if($back)
+ 		{	if($back =~ /^s$/ || $back =~ /^s\W/)	# back = s
+-			{	&pluralize;			# eg. 1960s
++			{	if (! &pluralize) {return 0;}			# eg. 1960s
+ 				$back =~ s/^s//;
+ 			}
+ 			if($back)
+@@ -649,7 +655,7 @@
+ 				else {&appendo($back);}	# back = punct or "'s"
+ 			}
+ 		}
+-		return;
++		return 1;
+ 	}
+ 
+ 	if($x =~ /^\d/)			# get integer part
+@@ -675,48 +681,48 @@
+ 	if($sign) { &pusho($sign); }
+ 
+ 	$ones=int($intpart%100);
+-	if($comma) {&printint($intpart);}
++	if($comma) {if (! &printint($intpart)) {return 0;} }
+ 	elsif(($intpart>=1900 || $intpart>=1100 && $ones==0)
+ 		&& $intpart<2000 && !$fracpart)			#4 digit -> 2+2
+ 	{	$hun=int($intpart/100);
+-		&printint($hun);
+-		if($ones>=10) {&printint($ones);}
++		if (! &printint($hun)) {return 0;}
++		if($ones>=10) {if (! &printint($ones)) {return 0;} }
+ 		elsif($ones>0)
+ 		{	&pusho("oh");
+-			&printint($ones);
++			if (! &printint($ones)) {return 0;}
+ 		}
+ 		else {&pusho("hundred");}
+ 	}
+ 	else
+-	{	&printint($intpart);
++	{	if (! &printint($intpart)) {return 0;}
+ 		$y=$last;
+ 		$y =~ s/^\W*//;				# thize dates: May 25th
+ 		if(length($intpart)<=2 && $months{$y})
+-		{	&thize("");
++		{	if (! &thize("")) {return 0;}
+ 			$back =~ s/[a-z]//g;
+ 		}
+ 	}
+-	if($fracpart) {&printdecfrac($fracpart);}
++	if($fracpart) {if (! &printdecfrac($fracpart)) {return 0;} }
+ 
+ 	if($back)
+ 	{	if($back =~ /^s$/ || $back =~ /^s\W/)	# back = s
+-		{	&pluralize;			# eg. 1960s
++		{	if (! &pluralize) {return 0;}			# eg. 1960s
+ 			$back =~ s/^s//;
+ 		}
+ 		if($back =~ /^st$/ || $back =~ /^st\W/)	# back= st
+-		{	&thize("st");			# eg. 1st
++		{	if (! &thize("st")) {return 0;}			# eg. 1st
+ 			$back =~ s/^st//;
+ 		}
+ 		if($back =~ /^nd$/ || $back =~ /^nd\W/)	# back= nd
+-		{	&thize("nd");			# eg. 2nd
++		{	if (! &thize("nd")) {return 0;}			# eg. 2nd
+ 			$back =~ s/^nd//;
+ 		}
+ 		if($back =~ /^rd$/ || $back =~ /^rd\W/)	# back= rd
+-		{	&thize("rd");			# eg. 3rd
++		{	if (! &thize("rd")) {return 0;}			# eg. 3rd
+ 			$back =~ s/^rd//;
+ 		}
+ 		if($back =~ /^th$/ || $back =~ /^th\W/)	# back= th
+-		{	&thize("th");			# eg. 4th
++		{	if (! &thize("th")) {return 0;}			# eg. 4th
+ 			$back =~ s/^th//;
+ 		}
+ 		if($back)
+@@ -724,6 +730,7 @@
+ 			else {&appendo($back);}	# back = punct or "'s"
+ 		}
+ 	}
++  return 1;
+ }
+ 
+ sub printdate			# printdate(n):	x/x/x format
+@@ -741,7 +748,7 @@
+ 	$back=$1;
+ 
+ 	if($x !~ /^\d{1,2}\/\d{1,2}\/(19)?\d{2}$/)
+-		{&perr("printdate: $_[0] is not a date");}
++		{&perr("printdate: $_[0] is not a date"); return 0;}
+ 
+ 	@y=split(/\//,$x);
+ 	$y[2] =~ s/^19(\d{2})$/$1/;
+@@ -752,20 +759,21 @@
+ 		$appendflg=1;
+ 	}
+ 
+-	&printint($y[0]);
++	if (! &printint($y[0])) {return 0;}
+ 	&appendo("/");
+ 
+ 	$appendflg=1;
+-	&printint($y[1]);
++	if (! &printint($y[1])) {return 0;}
+ 	&appendo("/");
+ 
+ 	$appendflg=1;
+-	&printint($y[2]);
++	if (! &printint($y[2])) {return 0;}
+ 
+ 	if($back)
+ 	{	if($back =~ /^[a-zA-Z]/) {&appendo("-");}
+ 		&appendo($back);
+ 	}
++  return 1;
+ }
+ 
+ sub printserno			# printserno(n): eg. B1, 3b2, 10W-40
+@@ -815,12 +823,12 @@
+ 		}		     # (should expand here unless in dictionary)
+ 		$x =~ s/^(\d*)//;	# strip off dig
+ 		$y=$1;
+-		if($y ne "") { &printdigstr($y); }
++		if($y ne "") { if (! &printdigstr($y)) {return 0;} }
+ 	}
+ 
+ 	if($back =~ /^s\b/)	# back = s
+ 	{			# eg. 2C60s
+-	    &pluralize;
++	    if (! &pluralize) {return 0;} 
+ 	    $back =~ s/^s//;
+ 	}
+ 	if($back)
+@@ -828,6 +836,7 @@
+ 		else {&appendo($back);}
+ 	}
+ 	$appendflg=0;
++  return 1;
+ }
+ 
+ sub printdigstr			# printdigstr(x)
+@@ -841,14 +850,13 @@
+ 	if($x =~ /^0/)			# leading zero
+ 	{	while($x ne "")
+ 		{	$x =~ s/^(.)//;
+-			if($1 !~ /\d/) {&perr("printdigstr: non-digit");}
++			if($1 !~ /\d/) {&perr("printdigstr: non-digit"); return 0;}
+ 			&pusho("$ones_z[$1]");
+ 		}
+ 		return;
+ 	}
+ 	if($x =~ /^\d0*$/)		# d, d0, d00, d000, etc
+-	{	&printint($x);
+-		return;
++	{	return &printint($x);
+ 	}
+ 
+ 	$_=$x;
+@@ -857,30 +865,29 @@
+ 	for($k=0;$y[$k]==0;$k++) {}			# k= nr following 0s
+ 
+ 	if($j==2)			# 2 dig
+-	{	&printint($x);
+-		return;
++	{	return &printint($x);
+ 	}
+ 	if($j==3)
+-	{	&printint($y[2]);
++	{	if (! &printint($y[2])) {return 0;}
+ 		if($y[1]==0) {&pusho("oh");}
+-		&printint("$y[1]$y[0]");
+-		return;
++		return &printint("$y[1]$y[0]");
+ 	}
+ 	if($j==5 && $k<=2)
+-	{	&printint("$y[4]");
++	{	if (! &printint("$y[4]")) {return 0;}
+ 		$j=4;
+ 	}
+ 	if($j==4)
+-	{	&printint("$y[3]$y[2]");
++	{	if (! &printint("$y[3]$y[2]")) {return 0;}
+ 		if($k==2) {&pusho("hundred");}
+ 		else
+ 		{	if($y[1]==0) {&pusho("oh");}
+-			&printint("$y[1]$y[0]");
++			return &printint("$y[1]$y[0]");
+ 		}
+-		return;
++		return 1;
+ 	}
+ 						# >5 dig: just sequential dig
+ 	for($j--;$j>=0;$j--) {&pusho("$ones_oh[$y[$j]]");}
++  return 1;
+ }
+ 
+ sub printftin			# printftin(n): eg. 6\'-4\"
+@@ -905,19 +912,19 @@
+ 
+ 	$x =~ s/^([\d\.]*)//;	# strip off dig & .
+ 	$y=$1;
+-	if(!$y) {&perr("printftin: bad feet");}
+-	&printnum($y);
++	if(!$y) {&perr("printftin: bad feet"); return 0;}
++	if (! &printnum($y)) {return 0;}
+ 	if($y==1) {&appendo("-foot");}
+ 	else {&appendo("-feet");}
+ 
+ 	$x =~ s/^\'//;	# strip off \'
+ 	$x =~ s/^-//;	# strip off -
+-	if(!$x) {&perr("printftin: bad intermed");}
++	if(!$x) {&perr("printftin: bad intermed"); return 0;}
+ 
+ 	$x =~ s/^([\d\.]*)//;	# strip off dig & .
+ 	$y=$1;
+-	if(!$y) {&perr("printftin: bad inches");}
+-	&printnum($y);
++	if(!$y) {&perr("printftin: bad inches"); return 0;}
++	if (! &printnum($y)) {return 0;}
+ 	if($y==1) {&appendo("-inch");}
+ 	else {&appendo("-inches");}
+ 
+@@ -925,6 +932,7 @@
+ 	{	if($back !~ /^[a-zA-Z]/) {&appendo($back);}
+ 		else {&pusho($back);}
+ 	}
++  return 1;
+ }
+ 
+ sub printint			# printint(x)
+@@ -968,13 +976,14 @@
+ 			}
+ 			if(int($j/3)>0)
+ 			{	if(int($j/3) > $#mult)
+-					{ &perr("printint: too big"); }
++					{ &perr("printint: too big"); return 0;}
+ 				&pusho($mult[int($j/3)]);
+ 			}
+ 			$commanextflg=1;
+ 		}
+ 	}
+ 	$commanextflg=0;
++  return 1;
+ }
+ 
+ sub printdecfrac
+@@ -989,6 +998,8 @@
+ 	if($leadingzeroflg)
+ 		{for($j=0;$j<=$#y;$j++) { &pusho($ones_z[$y[$j]]);}}
+ 	else {for($j=0;$j<=$#y;$j++) { &pusho($ones_oh[$y[$j]]);}}
++
++  return 1;
+ }
+ 
+ sub pluralize		# pluralize(): pluralize last entry on output stack
+@@ -1016,7 +1027,9 @@
+ 		$x =~ s/y$/ies/;
+ 		&pusho($x);
+ 	}
+-	else {&perr("pluralize: unknown word: $_");}
++	else {&perr("pluralize: unknown word: $_"); return 0;}
++
++  return 1;
+ }
+ 
+ sub thize		# thize(): add th to last entry on output stack
+@@ -1028,50 +1041,51 @@
+ 	$_=&geto;
+ 	if( /four$/ || /six$/ || /seven$/ || /ten$/ ||
+ 		/eleven$/ || /een$/ || /hundred$/ || /thousand$/ || /illion$/ )
+-	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} # xth
++	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # xth
+ 		&appendo("th");
+ 	}
+ 	elsif( /one$/ )						# 1st
+-	{	if($y && $y ne "st") {&perr("thize: mismatch: $_ $y\n");}
++	{	if($y && $y ne "st") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ 		$x=&popo();
+ 		$x =~ s/one$/first/;
+ 		&pusho($x);
+ 	}
+ 	elsif( /two$/ )						# 2nd
+-	{	if($y && $y ne "nd") {&perr("thize: mismatch: $_ $y\n");}
++	{	if($y && $y ne "nd") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ 		$x=&popo();
+ 		$x =~ s/two$/second/;
+ 		&pusho($x);
+ 	}
+ 	elsif( /three$/ )					# 3rd
+-	{	if($y && $y ne "rd") {&perr("thize: mismatch: $_ $y\n");}
++	{	if($y && $y ne "rd") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ 		$x=&popo();
+ 		$x =~ s/three$/third/;
+ 		&pusho($x);
+ 	}
+ 	elsif( /five$/ || /twelve$/ )				# 5th, 12th
+-	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");}
++	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ 		$x=&popo();
+ 		$x =~ s/ve$/fth/;
+ 		&pusho($x);
+ 	}
+ 	elsif(/eight$/)
+-	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} # 8th
++	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # 8th
+ 		&appendo("h");
+ 	}
+ 	elsif( /nine$/ )
+-	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");}
++	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ 		$x=&popo();
+ 		$x =~ s/nine$/ninth/;
+ 		&pusho($x);
+ 	}
+ 	elsif( /ty$/ )
+-	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");}
++	{	if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ 		$x=&popo();
+ 		$x =~ s/ty$/tieth/;
+ 		&pusho($x);
+ 	}
+-	else {&perr("thize: unknown word: $_");}
++	else {&perr("thize: unknown word: $_"); return 0;}
++  return 1;
+ }
+ 
+ sub pusho				# pusho($x): push output
+@@ -1089,17 +1103,17 @@
+ sub appendo				# appendo($x): append to output
+ {	$appendflg=0;		
+ #	if($#output < 0) {&pusho("");}
+-	if($#output < 0) {&perr("appendo: output empty");}
++	if($#output < 0) {&perr("appendo: output empty"); return 0;}
+ 	$output[$#output] .= @_[0];
+ }
+ 
+ sub popo				# popo(): pop last output
+-{	if($#output < 0) {&perr("popo: output empty");}
++{	if($#output < 0) {&perr("popo: output empty"); return 0;}
+ 	pop(@output);
+ }
+ 
+ sub geto				# geto(): get last output
+-{	if($#output < 0) {&perr("geto: output empty");}
++{	if($#output < 0) {&perr("geto: output empty"); return 0;}
+ 	return $output[$#output];
+ }
+ 
+@@ -1111,8 +1125,6 @@
+ 	$appendflg=0;
+ 	$commanextflg=0;
+ 	&pusho($this);
+-	$field++;		# graceful error recovery
+-	goto wloop;
+ }
+ 
+ sub perr2
+diff -Naur tools/csr4_utils/pare-sgml.perl local/data_prep/csr_hub4_utils/pare-sgml.perl
+--- tools/csr4_utils/pare-sgml.perl	1996-08-27 15:25:17.000000000 -0400
++++ local/data_prep/csr_hub4_utils/pare-sgml.perl	2017-11-03 13:22:09.486213159 -0400
+@@ -1,11 +1,14 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+ 
+ # $Id: pare-sgml.perl,v 1.3 1996/08/15 02:51:17 robertm Rel $
+ # removes extraneous headers and other non-LM fields
+ # translates <DOC ...> into LM-standard <art ...>
+ # removes comments (enclosed in brackets)
+ 
+-$intext=0;
++use strict;
++use warnings;
++
++my $intext=0;
+ while (<>)
+ {
+     if ($intext == 0)
+diff -Naur tools/csr4_utils/process_filelist.sh local/data_prep/csr_hub4_utils/process_filelist.sh
+--- tools/csr4_utils/process_filelist.sh	1969-12-31 19:00:00.000000000 -0500
++++ local/data_prep/csr_hub4_utils/process_filelist.sh	2017-11-03 13:22:09.490213160 -0400
+@@ -0,0 +1,30 @@
++#! /bin/bash
++# Runs the LM text pipeline (pare-sgml ... puncproc) on each gzipped file
++# listed in <filelist> and writes the result to <dir>/<name>.txt.gz.
++set -euo pipefail
++set -x
++
++if [ $# -ne 2 ]; then
++  echo "Usage: $0 <filelist> <dir>" 1>&2
++  exit 1
++fi
++
++filelist=$1
++dir=$2
++
++export PATH=$PATH:tools/csr4_utils
++
++# Entries are split on whitespace, so listed paths must not contain any.
++for file in $(cat "$filelist"); do
++  BASENM=$(basename "$file")
++  name="${BASENM%.*}"
++
++  echo "Running LM pipeline for |$BASENM|..." 1>&2
++  gunzip -c "$file" | pare-sgml.perl | \
++    bugproc.perl | \
++    numhack.perl | \
++    numproc.perl -xtools/csr4_utils/num_excp | \
++    abbrproc.perl tools/csr4_utils/abbrlist | \
++    puncproc.perl -np | gzip -c > "$dir/$name.txt.gz"
++  echo "Done with $BASENM."
++done
+diff -Naur tools/csr4_utils/progsummary.perl local/data_prep/csr_hub4_utils/progsummary.perl
+--- tools/csr4_utils/progsummary.perl	1996-07-12 09:26:35.000000000 -0400
++++ local/data_prep/csr_hub4_utils/progsummary.perl	2017-11-03 13:22:09.494213160 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+ 
+ # Program:	progsummary.perl
+ # Written by:	dave graff
+diff -Naur tools/csr4_utils/puncproc.perl local/data_prep/csr_hub4_utils/puncproc.perl
+--- tools/csr4_utils/puncproc.perl	1996-08-27 15:25:17.000000000 -0400
++++ local/data_prep/csr_hub4_utils/puncproc.perl	2017-11-03 13:22:09.494213160 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+ 
+ # $Id: puncproc.perl,v 1.2 1996/08/05 16:12:42 robertm Rel $
+ ###############################################################################
+@@ -59,7 +59,7 @@
+ 						# forbidden symbols
+ 	if(/</) {&perr("<");}				# <
+ 	if(/>/) {&perr(">");}				# >
+-	if(/\$/) {&perr("$");}				# $
++	if(/\$/) {&perr("\$");}				# $
+ 	if(/_/) {&perr("_");}				# _
+ 	if(/\d/) {&perr("[0-9]");}			# 0-9
+ 
+diff -Naur tools/csr4_utils/tr-bn-char.fast.perl local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl
+--- tools/csr4_utils/tr-bn-char.fast.perl	1996-08-21 02:39:12.000000000 -0400
++++ local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl	2017-11-03 13:22:09.502213160 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl -pi.old-char
++#!/usr/bin/perl -pi.old-char
+ 
+ # handles nonprinting characters in Broadcast News material, to the extent
+ # that they can be handled, and perhaps a bit beyond...
+diff -Naur tools/csr4_utils/tr-bn-char.slow.perl local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl
+--- tools/csr4_utils/tr-bn-char.slow.perl	1996-08-21 01:30:18.000000000 -0400
++++ local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl	2017-11-03 13:22:09.502213160 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl -p
++#!/usr/bin/perl -p
+ 
+ # handles nonprinting characters in Broadcast News material, to the extent
+ # that they can be handled, and perhaps a bit beyond...
diff --git a/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl b/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl
new file mode 100755
index 00000000000..84913e9a8b0
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl
@@ -0,0 +1,131 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017  Johns Hopkins University 
+#                        (Author: Jan "Yenda" Trmal <jtrmal@gmail.com>)
+#               2017  Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+use List::Util qw(max);
+
+my $audio_width=1;    # widest recording name (only reported on STDERR)
+my $speaker_width=1;  # widest speaker name; pads the speaker part of IDs
+my $time_width=1;     # widest time string; pads the time parts of IDs
+
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+if (@ARGV != 3) {
+  print STDERR "$0: Error: Unsupported number of arguments: " . scalar @ARGV ."\n";
+  print STDERR "  Usage: $0 <audio-files> <transcripts> <destination>\n";
+  print STDERR "  where\n";
+  print STDERR "    <audio-files> is a file containing list of audio files\n";
+  print STDERR "      (single absolute path name per line)\n";
+  print STDERR "    <transcripts> is a file containing transcripts obtained\n";
+  print STDERR "      by processing the official SGML format\n";
+  print STDERR "      transcripts. See parse_sgm.pl for further info.\n";
+  print STDERR "    <destination> target directory (should already exist)\n";
+  print STDERR "  See also: local/parse_sgm.pl\n";
+  die;
+}
+
+my $audio_files = $ARGV[0];
+my $transcripts = $ARGV[1];
+my $out = $ARGV[2];
+
+my %AUDIO;  # recording name (audio file base name) -> audio file path
+open(my $audio_f, "<", $audio_files)
+  or die "$0: Error: Could not open $audio_files: $!\n";
+while(my $line = <$audio_f>) {
+  chomp $line;
+  (my $basename = $line) =~ s/.*\/([^\/]+)\.sph/$1/;  # drop directory and .sph
+  $basename =~ s/_$//g;                               # drop a trailing underscore
+  $AUDIO{$basename} = $line;
+}
+close($audio_f);
+
+my %TRANSCRIPT;  # recording name -> list of per-utterance field arrays
+open(my $transcript_f, "<:encoding(utf-8)", $transcripts)
+  or die "$0: Error: Could not open $transcripts: $!\n";
+while(my $line = <$transcript_f>) {
+  chomp $line;
+  my @F = split / /, $line, 8;
+  push @{$TRANSCRIPT{$F[0]}}, \@F;
+
+  # Fields used: 0 = recording, 2 = speaker, 5/6 = start/end time, 7 = text.
+  my $f1 = $F[0];
+  my $speaker = $F[2];
+  my $t1 = $F[5];
+  my $t2 = $F[6];
+
+  $time_width = max $time_width, length($t1), length($t2);
+  $speaker_width = max $speaker_width, length($speaker);
+  $audio_width = max $audio_width, length($f1);
+}
+close($transcript_f);
+
+# Report the field widths that were found (time, speaker, recording name).
+print STDERR $time_width . " " . $speaker_width . " " . $audio_width . "\n";
+
+my $sph2pipe = `which sph2pipe` or do {
+  die "$0: Error: sph2pipe is not installed. Did you run make in the tools/ directory?\n";
+};
+chomp $sph2pipe;
+
+open(my $wav_file, ">", "$out/wav.scp")
+  or die "$0: Error: Cannot create file $out/wav.scp: $!\n";
+open(my $text_file, ">:encoding(utf-8)", "$out/text")
+  or die "$0: Error: Cannot create file $out/text: $!\n";
+open(my $segments_file, ">", "$out/segments")
+  or die "$0: Error: Cannot create file $out/segments: $!\n";
+open(my $spk_file, ">", "$out/utt2spk")
+  or die "$0: Error: Cannot create file $out/utt2spk: $!\n";
+
+foreach my $file (sort keys %AUDIO) {
+  print STDERR "$0: Error: $file does not exist in transcripts!\n"
+    unless exists $TRANSCRIPT{$file};
+  my $transcripts = $TRANSCRIPT{$file} || [];
+
+  # The recording ID is the audio base name as is (no zero padding).
+  my $file_fmt = sprintf("%s", $file);
+  # The wav.scp entry is written even if the recording has no transcripts.
+  print $wav_file "$file_fmt $sph2pipe -f wav $AUDIO{$file}|\n";
+
+  foreach my $utt (@{$transcripts}) {
+    my $start = $utt->[5] + 0.0;
+    my $end = $utt->[6] + 0.0;
+    if ($end - $start < 0.005) {   # remove very short segments
+      next;
+    }
+    # Zero-pad the scaled times and the speaker to the widths found above.
+    my $start_time = sprintf("%0${time_width}d", $utt->[5]*1000);
+    my $end_time = sprintf("%0${time_width}d", $utt->[6]*1000);
+    my $spk = sprintf("%0${speaker_width}s", $utt->[2]);
+    my $spkid = "${file_fmt}_${spk}";
+    my $uttid = "${file_fmt}_${spk}_${start_time}_${end_time}";
+
+    print $text_file "$uttid $utt->[7]\n";
+    print $spk_file "$uttid $spkid\n";
+    print $segments_file "$uttid $file_fmt $start $end\n";
+  }
+}
+
+close($wav_file);
+close($text_file);
+close($segments_file);
+close($spk_file);
diff --git a/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl b/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl
new file mode 120000
index 00000000000..844c16bbe06
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl
@@ -0,0 +1 @@
+format_1996_bn_data.pl
\ No newline at end of file
diff --git a/egs/hub4_english/s5/local/data_prep/hub4_utils.py b/egs/hub4_english/s5/local/data_prep/hub4_utils.py
new file mode 100644
index 00000000000..4ee9eab1c7e
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/hub4_utils.py
@@ -0,0 +1,156 @@
+# Copyright 2016    Vimal Manohar
+# Apache 2.0.
+
+"""This module contains utilities for preparing the HUB4 broadcast news
+evaluation corpora.
+"""
+
+import os
+import re
+import sys
+
+
+def parse_uem_line(reco, line):
+    """Convert one UEM line into a kaldi segments line.
+
+    UEM format: <file-id> <channel> <start-time> <end-time>.  Returns
+    "<utt> <reco> <start> <end>", or None for blank / ";;" comment lines.
+    The file-id is used as the recording-id when 'reco' is None.
+
+    Raises TypeError if the line has fewer than 4 fields or channel != 1.
+    """
+    line = line.strip()
+    if not line or line.startswith(";;"):
+        return None
+    parts = line.split()
+
+    # Reject short lines here rather than failing later with an IndexError;
+    # the channel ID is expected to be 1.
+    if len(parts) < 4 or parts[1] != "1":
+        raise TypeError("Invalid line {0}".format(line))
+    if reco is None:
+        reco = parts[0]
+    start_time, end_time = float(parts[2]), float(parts[3])
+
+    # Round (not truncate) to centiseconds: int(0.29 * 100) would give 28.
+    utt = "{0}-{1:06d}-{2:06d}".format(reco, int(round(start_time * 100)),
+                                       int(round(end_time * 100)))
+    return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time)
+
+
+def parse_cmu_seg_line(line, prepend_reco_to_spk=False):
+    """Parse one line of the CMU automatic segmentation of a recording.
+
+    The CMU segmentation has the following format:
+    <file> <channel> <speaker> <start-time> <end-time> <condition>
+
+    e.g.:
+    h4e_98_1 1 F0-0000     0.00    28.22 F0
+
+    The file-id is taken as the recording-id and the channel must be 1.
+    Returns None for blank / ";;" comment lines, else the tuple
+    (segments_line, utt2spk_line).  If 'prepend_reco_to_spk' is true the
+    speaker-id (and hence the utterance-id) is prefixed by the recording-id.
+
+    Raises TypeError if the line has fewer than 5 fields or channel != 1.
+    """
+    line = line.strip()
+    if not line or line.startswith(";;"):
+        return None
+    parts = line.split()
+
+    # Reject short lines here rather than failing later with an IndexError;
+    # the channel ID is expected to be 1.
+    if len(parts) < 5 or parts[1] != "1":
+        raise TypeError("Invalid line {0}".format(line))
+
+    # Actually a file, but we assume a 1-1 mapping to recording.
+    reco, spk = parts[0], parts[2]
+    start_time, end_time = float(parts[3]), float(parts[4])
+    # Round (not truncate) to centiseconds: int(0.29 * 100) would give 28.
+    times = "{0:06d}-{1:06d}".format(int(round(start_time * 100)),
+                                     int(round(end_time * 100)))
+
+    if prepend_reco_to_spk:
+        spk = reco + '-' + spk
+        utt = "{0}-{1}".format(spk, times)
+    else:
+        utt = "{0}-{1}-{2}".format(spk, reco, times)
+
+    segment_line = "{0} {1} {st:.3f} {end:.3f}".format(
+        utt, reco, st=start_time, end=end_time)
+    return (segment_line, "{0} {1}".format(utt, spk))
+
+
+def normalize_csr_transcript(text, noise_word, spoken_noise_word):
+    """Normalize a CSR broadcast news transcript for acoustic modeling.
+
+    Returns the upper-cased 'text' with the annotation markup removed; noise
+    events are replaced by 'spoken_noise_word' or 'noise_word'.
+    """
+    text = text.upper()
+
+    # Remove long event markings
+    text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text)
+    # Remove comments
+    text = re.sub(r"\{\{[^}]*\}\}", "", text)
+    # Replace alternative words with a single one (second alternative)
+    text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text)
+    # Remove partial word completions
+    text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text)
+    # Remove accent marks and diacritics
+    text = re.sub(r"\\[3-8]", "", text)
+    # Remove unclear speech markings
+    text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
+    text = re.sub(r"#", "", text)   # Remove overlapped speech markings
+    # Remove invented word markings
+    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
+    # Replace speaker-made noises with <SPOKEN_NOISE>
+    # NOTE(review): "[SIGN]" may be a typo for "[SIGH]" -- confirm.
+    text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]",
+                  spoken_noise_word, text)
+    # Replace noise with <NOISE>
+    text = re.sub(r"\[[^]]+\]", noise_word, text)
+    text = re.sub(r"\+([^+]+)\+", r"\1", text)   # Remove +...+ markings
+    # Remove periods after letter.
+    text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text)
+    # Replace \. with . (the period is escaped: a bare '.' matches any char)
+    text = re.sub(r"\\\.", r".", text)
+
+    words = []
+    for word in text.split():
+        if word not in (spoken_noise_word, noise_word):
+            # Remove the leading '@' marking
+            word = re.sub(r"^@(\w+)$", r"\1", word)
+            # Remove everything other than the standard ASCII symbols
+            word = re.sub("[^A-Za-z0-9.' _-]", "", word)
+        words.append(word)
+    return " ".join(words)
+
+
+def remove_punctuations(text):
+    """Remove punctuation and markup from a text sentence.
+
+    Returns the text containing only ASCII letters, digits, '.', "'", '_',
+    '-' and single spaces.
+    """
+    # Remove HTML new lines that are not end of sentences
+    text1 = re.sub("\n", " ", text)
+    # Remove some markers like double dash that are normally used to separate
+    # name titles in newspapers.
+    text1 = re.sub(r"(&[^;]+;|--)", " ", text1)
+    # Remove quotation marks
+    text1 = re.sub(r"''|``|\(|\)", " ", text1)
+    # Remove everything other than the standard ASCII symbols
+    text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1)
+
+    # Replace multiple .'s with a single one (keeping the following space, so
+    # that adjacent words are not glued together), then remove isolated '.'
+    text1 = re.sub(r"\.[.]+ ", ". ", text1)
+    text1 = re.sub(r" \. ", " ", text1)
+    # Remove isolated '-'
+    text1 = re.sub(r" - ", " ", text1)
+    # Replace multiple spaces with single.
+    text1 = re.sub(r"[ ]+", " ", text1)
+
+    return text1
diff --git a/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl b/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl
new file mode 100755
index 00000000000..3db0e1c71c3
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+
+# Copyright 2017  Vimal Manohar
+# Apache 2.0
+
+@ARGV == 2 ||  die "usage: normalize_bn96_transcripts.pl noise_word spoken_noise_word < transcript > transcript2";
+$noise_word = shift @ARGV;          # replaces [...] markings
+$spoken_noise_word = shift @ARGV;   # replaces {...} markings
+# Reads "<utt-id> <transcript>" lines from STDIN and prints the normalized lines.
+while(<STDIN>) {
+    $_ =~ m:^(\S+) (.+): || die "bad line $_";
+    $utt = $1;
+    $trans = $2;
+    print "$utt";
+
+    $trans =~ tr:a-z:A-Z:;
+    $trans =~ s:\(\(([^)]*)\)\):$1 :g;   # Remove unclear speech markings
+    $trans =~ s:#: :g; # Remove overlapped speech markings
+    $trans =~ s:\*\*([^*]+)\*\*:$1 :g;       # Remove invented word markings
+    $trans =~ s:\[[^]]+\]:$noise_word :g;
+    $trans =~ s:\{[^}]+\}:$spoken_noise_word :g;
+    # Remove mispronunciation brackets around every +...+ span.  (Anchoring the
+    # pattern with ^...$ only matched a transcript that was one single span.)
+    $trans =~ s:[+]([^+]+)[+]:$1:g;
+    foreach $w (split (" ",$trans)) {
+        $w =~ s:^@(.*)$:$1:;  # Remove best guess marking for proper nouns
+        print " $w"; }
+    print "\n";
+}
diff --git a/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl b/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl
new file mode 100755
index 00000000000..b27f8da65f8
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/env perl
+
+# Copyright 2017  Vimal Manohar
+# Apache 2.0
+
+@ARGV == 2 ||  die "usage: normalize_bn97_transcripts.pl noise_word spoken_noise_word < transcript > transcript2";
+$noise_word = shift @ARGV;          # replaces [...] markings
+$spoken_noise_word = shift @ARGV;   # replaces {...} markings
+# Reads "<utt-id> <transcript>" lines from STDIN and prints the normalized lines.
+while(<STDIN>) {
+    $_ =~ m:^(\S+) (.+): || die "bad line $_";
+    $utt = $1;
+    $trans = $2;
+    print "$utt";
+
+    $trans =~ tr:a-z:A-Z:;
+    $trans =~ s:\(\(([^)]*)\)\):$1 :g;   # Remove unclear speech markings
+    $trans =~ s:#: :g; # Remove overlapped speech markings
+    $trans =~ s:\*\*([^*]+)\*\*:$1 :g;       # Remove invented word markings
+    $trans =~ s:\[[^]]+\]:$noise_word :g;
+    $trans =~ s:\{[^}]+\}:$spoken_noise_word :g;
+    # Remove mispronunciation brackets around every +...+ span.  (Anchoring the
+    # pattern with ^...$ only matched a transcript that was one single span.)
+    $trans =~ s:[+]([^+]+)[+]:$1:g;
+    foreach $w (split (" ",$trans)) {
+        # The noise words themselves are printed unchanged.
+        if ($w ne $noise_word && $w ne $spoken_noise_word) {
+          $w =~ s:[?.,!]+$::;   # Remove punctuations
+          $w =~ s:^@(.*)$:$1:;  # Remove best guess marking for proper nouns
+          $w =~ s:^[\^](.*)$:$1:;  # Remove capitalization marks
+          $w =~ s:_([A-Z])'S$:$1.'S :g;  # Normalize abbreviations from _f_b_i to f. b. i.
+          $w =~ s:_([A-Z]):$1. :g;  # Normalize abbreviations from _f_b_i to f. b. i.
+          $w =~ s:[ ]+$::;  # Remove trailing spaces
+        }
+        print " $w"; }
+    print "\n"; }
diff --git a/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl b/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
new file mode 100755
index 00000000000..37487296809
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
@@ -0,0 +1,229 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017  Johns Hopkins University (Author: Jan "Yenda" Trmal <jtrmal@gmail.com>)
+#               2017  Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABILITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+require HTML::Parser or die "This script needs HTML::Parser from CPAN";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+sub trim { my ($str) = @_; $str =~ s/^\s+//; $str =~ s/\s+$//; return $str };  # strip leading/trailing whitespace
+
+sub parse_sgml_tag {
+  # Parses the attributes of one SGML tag line, e.g. '<tag key=val, key2=val2>',
+  # and returns them as a hash (key => value, quotes are not stripped).
+  # Returns an empty hash if the line contains no '='.
+  my ($text) = @_;
+  my %attrs;
+  return %attrs unless $text =~ /=/;
+
+  # Drop the leading '<tagname ' and the closing '>'.
+  $text =~ s/<[a-zA-Z]+ //;
+  $text =~ s/> *$//;
+
+  # Attributes are separated by spaces, optionally preceded by a comma.
+  foreach my $pair (split / *,? +/, $text) {
+    my ($name, $val) = split '=', $pair, 2;
+    $attrs{$name} = $val;
+  }
+  return %attrs;
+}
+
+# Exactly one argument is expected: a file listing the SGML transcripts.
+if (@ARGV != 1) {
+  print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+  print STDERR "  Usage: $0 <transcripts>\n";
+  print STDERR "  where\n";
+  print STDERR "    <transcripts> is a file containing the official SGML format\n";
+  print STDERR "      transcripts. The files are parsed and the parsed representation\n";
+  print STDERR "      is dumped to STDOUT (one utterance + the additional data fields\n";
+  print STDERR "      per line (we dump all the fields, but not all fields are used\n";
+  print STDERR "      in the recipe).\n";
+  die;
+}
+my $filelist=$ARGV[0];
+
+# NOTE(review): this parser object is not used anywhere else in the script.
+my $p = HTML::Parser->new();
+
+# Read the list of SGML files, one path per line ($! gives the open() error).
+open(my $list_fh, '<', $filelist) or die "Could not open file $filelist: $!\n";
+chomp(my @files = <$list_fh>);
+close($list_fh);
+
+# Main loop: parse each SGML transcript and print one line to STDOUT for every
+# stretch of text found between two time marks, in the format
+#   <sgml-file> <audio-file> <speaker> <fidelity> <mode> <start> <end> <text>
+# A stretch starts at a <segment> or <sync> tag and ends at the next <sync>
+# tag or at the closing </segment> tag; stretches without text are skipped.
+# All text is lower-cased; diagnostics about the input go to STDERR.
+use File::Basename;  # for basename()
+foreach my $file (@files) {
+  # State of the segment currently being read ("XXX" = unknown).
+  my $segment_start = -1;
+  my $segment_end = -1;
+  my $segment_speaker = "XXX";
+  my $segment_fidelity = "XXX";
+  my $segment_mode = "XXX";
+  my $section_start = -1;
+  my $section_end = -1;
+  my $filename = "";
+  my @text = ();     # transcript lines collected since $time
+  my $time;          # start time of the text collected in @text
+  my @tagqueue;      # stack of the currently open [tag-name, attributes]
+
+  # Recording name = basename of the SGML file, without the .txt / .sgml
+  # suffix and without a trailing underscore.  basename() from
+  # File::Basename replaces the former `basename $file` subshell, which
+  # broke on paths containing spaces or shell metacharacters.
+  my $sgml_file = trim(basename($file));
+  $sgml_file =~ s/\.txt$//g;
+  $sgml_file =~ s/\.sgml$//g;
+  $sgml_file =~ s/_$//g;
+
+  open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $!\n";
+
+  while(my $line = <$f>) {
+    chomp $line;
+    $line = trim $line;
+    $line = lc $line;
+    next unless $line;
+
+    if ($line =~ /<episode/) {
+      # <episode filename=...>: remember the audio file name (no quotes/.sph).
+      my %tags = parse_sgml_tag $line;
+      $filename = $tags{'filename'};
+      $filename =~ s/"//g;
+      $filename =~ s/\.sph//g;
+
+      if ($sgml_file ne $filename) {
+        print STDERR "$0: WARNING: SGML filename does not match episode filename $filename in file $file\n";
+      }
+      # Track the open tag so that the closing tag can be checked.
+      push @tagqueue, ["episode", \%tags];
+    } elsif ($line =~ /<\/episode/) {
+      # Closing tag: it must match the most recently opened tag.
+      my $open_tag = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $open_tag->[0] . "' vs '$line'"
+        if $open_tag->[0] ne $line;
+    } elsif ($line =~ /<section/) {
+      # Section times come as s_time/e_time or as starttime/endtime.
+      # NOTE(review): they are recorded here but never used for the output.
+      my %tags = parse_sgml_tag $line;
+
+      if ($tags{'s_time'}) {
+        $section_start = $tags{'s_time'};
+      } else {
+        $section_start = $tags{'starttime'};
+      }
+
+      if ($tags{'e_time'}) {
+        $section_end = $tags{'e_time'};
+      } else {
+        $section_end = $tags{'endtime'};
+      }
+
+      push @tagqueue, ["section", \%tags];
+    } elsif ($line =~ /<\/section/) {
+      # Closing tag: it must match the most recently opened tag.
+      my $open_tag = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $open_tag->[0] . "' vs '$line'"
+        if $open_tag->[0] ne $line;
+    } elsif ($line =~ /<segment/) {
+      # A new speaker segment; text is collected until </segment> or <sync>.
+      my %tags = parse_sgml_tag $line;
+      # Fall back to "XXX" when the tag carries no speaker attribute.
+      $segment_speaker = defined $tags{'speaker'} ? $tags{'speaker'} : "XXX";
+      $segment_speaker =~ s/"//g;
+      $segment_start = $tags{'s_time'};
+      $segment_end = $tags{'e_time'};
+      $segment_fidelity = $tags{'fidelity'} if $tags{'fidelity'};
+      $segment_mode = $tags{'mode'} if $tags{'mode'};
+      $time = $segment_start;
+      push @tagqueue, ["segment", \%tags];
+    } elsif ($line =~ /<\/segment/) {
+      my $open_tag = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $open_tag->[0] . "' vs '$line'"
+        if $open_tag->[0] ne $line;
+
+      # Flush the remaining text; it ends at the end time of the segment.
+      my $new_time = $segment_end;
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      # Reset the per-segment state.
+      @text = ();
+      $time = 0;
+      $segment_speaker = "XXX";
+      $segment_start = "XXX";
+      $segment_end = "XXX";
+      $segment_fidelity = "XXX";
+      $segment_mode = "XXX";
+    } elsif ($line =~ /<sync/) {
+      # <sync time=...> splits a segment: flush the text collected so far.
+      my %tags = parse_sgml_tag $line;
+      my $new_time = $tags{'time'};
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = $new_time;
+    } elsif ($line =~ /<\/sync/) {
+      # Nothing to do: the text was already flushed at the opening tag.
+    } elsif ($line =~ /<overlap/ || $line =~ /<\/overlap/) {
+      # Overlap markers are dropped.
+    } elsif ($line =~ /<background/ || $line =~ /<comment/) {
+      # Background and comment annotations are dropped.
+    } elsif ($line =~ /<foreign/ || $line =~ /<\/foreign/) {
+      # Foreign-language markers are kept inline in the transcript text.
+      $line = trim $line;
+      push @text, $line;
+    } elsif ($line =~ /<unclear/ || $line =~ /<\/unclear/) {
+      # Unclear-speech markers are kept inline in the transcript text.
+      $line = trim $line;
+      push @text, $line;
+    } elsif ($line =~ /<[^\/]/) {
+      # Any other opening tag is reported and otherwise ignored.
+      print STDERR "$0: INFO: Unknown tag $line in file $file\n";
+    } elsif ($line =~ /<\//) {
+      # Any other closing tag is silently ignored.
+    } else {
+      # Plain transcript text.
+      $line = trim $line;
+      push @text, $line if $line;
+    }
+  }
+  close($f);
+
+  # Tags still open at the end of the file indicate a malformed transcript.
+  if (@tagqueue > 0) {
+    print STDERR "$0: WARNING: " . scalar(@tagqueue) . " unclosed tag(s) in file $file\n";
+  }
+}
diff --git a/egs/hub4_english/s5/local/data_prep/parse_sgm_1997_hub4_eng.pl b/egs/hub4_english/s5/local/data_prep/parse_sgm_1997_hub4_eng.pl
new file mode 100755
index 00000000000..fe5ea13779f
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/parse_sgm_1997_hub4_eng.pl
@@ -0,0 +1,228 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017  Johns Hopkins University (Author: Jan "Yenda" Trmal <jtrmal@gmail.com>)
+#               2017  Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABILITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+require HTML::Parser or die "This script needs HTML::Parser from CPAN";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+sub trim { my ($str) = @_; $str =~ s/^\s+//; $str =~ s/\s+$//; return $str };  # strip leading/trailing whitespace
+
+sub parse_sgml_tag {
+  # Parses the attributes of one SGML tag line, e.g. '<tag key=val, key2=val2>',
+  # and returns them as a hash (key => value, quotes are not stripped).
+  # Returns an empty hash if the line contains no '='.
+  my ($text) = @_;
+  my %attrs;
+  return %attrs unless $text =~ /=/;
+
+  # Drop the leading '<tagname ' and the closing '>'.
+  $text =~ s/<[a-zA-Z]+ //;
+  $text =~ s/> *$//;
+
+  # Attributes are separated by spaces, optionally preceded by a comma.
+  foreach my $pair (split / *,? +/, $text) {
+    my ($name, $val) = split '=', $pair, 2;
+    $attrs{$name} = $val;
+  }
+  return %attrs;
+}
+
+# Exactly one argument is expected: a file listing the SGML transcripts.
+if (@ARGV != 1) {
+  print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+  print STDERR "  Usage: $0 <transcripts>\n";
+  print STDERR "  where\n";
+  print STDERR "    <transcripts> is a file containing the official SGML format\n";
+  print STDERR "      transcripts. The files are parsed and the parsed representation\n";
+  print STDERR "      is dumped to STDOUT (one utterance + the additional data fields\n";
+  print STDERR "      per line (we dump all the fields, but not all fields are used\n";
+  print STDERR "      in the recipe).\n";
+  die;
+}
+my $filelist=$ARGV[0];
+
+# NOTE(review): this parser object is not used anywhere else in the script.
+my $p = HTML::Parser->new();
+
+# Read the list of SGML files, one path per line ($! gives the open() error).
+open(my $list_fh, '<', $filelist) or die "Could not open file $filelist: $!\n";
+chomp(my @files = <$list_fh>);
+close($list_fh);
+
+# Main loop: parse each SGML transcript and print one line to STDOUT for every
+# stretch of text found between two time marks, in the format
+#   <sgml-file> <audio-file> <speaker> <fidelity> <mode> <start> <end> <text>
+# A stretch starts at a <turn> or <time> tag and ends at the next <time> tag
+# or at the closing </turn> tag; stretches without text are skipped.
+use File::Basename;  # for basename()
+foreach my $file (@files) {
+  # State of the turn currently being read ("XXX" = unknown).
+  my $segment_start = -1;
+  my $segment_end = -1;
+  my $segment_speaker = "XXX";
+  my $segment_fidelity = "XXX";
+  my $segment_mode = "XXX";
+  my $section_start = -1;
+  my $section_end = -1;
+  my $filename = "";
+  my @text = ();     # transcript lines collected since $time
+  my $time;          # start time of the text collected in @text
+  my @tagqueue;      # stack of the currently open [tag-name, attributes]
+
+  # Recording name = basename of the SGML file, without the .txt / .sgml
+  # suffix and without a trailing underscore.  basename() from
+  # File::Basename replaces the former `basename $file` subshell, which
+  # broke on paths containing spaces or shell metacharacters.
+  my $sgml_file = trim(basename($file));
+  $sgml_file =~ s/\.txt$//g;
+  $sgml_file =~ s/\.sgml$//g;
+  $sgml_file =~ s/_$//g;
+
+  open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $!\n";
+
+  while(my $line = <$f>) {
+    chomp $line;
+    $line = trim $line;
+    $line = lc $line;
+    next unless $line;
+
+    if ($line =~ /<episode/) {
+      # <episode filename=...>: remember the audio file name (no quotes/.sph).
+      my %tags = parse_sgml_tag $line;
+      $filename = $tags{'filename'};
+      $filename =~ s/"//g;
+      $filename =~ s/\.sph//g;
+
+      if ($sgml_file ne $filename) {
+        print STDERR "$0: WARNING: SGML filename does not match episode filename $filename in file $file\n";
+      }
+      # Track the open tag so that the closing tag can be checked.
+      push @tagqueue, ["episode", \%tags];
+    } elsif ($line =~ /<\/episode/) {
+      # Closing tag: it must match the most recently opened tag.
+      my $open_tag = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $open_tag->[0] . "' vs '$line'"
+        if $open_tag->[0] ne $line;
+    } elsif ($line =~ /<section/) {
+      # Section times come as s_time/e_time or as starttime/endtime.
+      # NOTE(review): they are recorded here but never used for the output.
+      my %tags = parse_sgml_tag $line;
+
+      if ($tags{'s_time'}) {
+        $section_start = $tags{'s_time'};
+      } else {
+        $section_start = $tags{'starttime'};
+      }
+
+      if ($tags{'e_time'}) {
+        $section_end = $tags{'e_time'};
+      } else {
+        $section_end = $tags{'endtime'};
+      }
+
+      push @tagqueue, ["section", \%tags];
+    } elsif ($line =~ /<\/section/) {
+      # Closing tag: it must match the most recently opened tag.
+      my $open_tag = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $open_tag->[0] . "' vs '$line'"
+        if $open_tag->[0] ne $line;
+    } elsif ($line =~ /<turn/) {
+      # A new speaker turn; text is collected until </turn> or <time>.
+      my %tags = parse_sgml_tag $line;
+      # Fall back to "XXX" when the tag carries no speaker attribute.
+      $segment_speaker = defined $tags{'speaker'} ? $tags{'speaker'} : "XXX";
+      $segment_speaker =~ s/"//g;
+      $segment_start = $tags{'starttime'};
+      $segment_end = $tags{'endtime'};
+      $segment_fidelity = $tags{'fidelity'} if $tags{'fidelity'};
+      $segment_mode = $tags{'mode'} if $tags{'mode'};
+      $time = $segment_start;
+      push @tagqueue, ["turn", \%tags];
+    } elsif ($line =~ /<\/turn/) {
+      my $open_tag = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $open_tag->[0] . "' vs '$line'"
+        if $open_tag->[0] ne $line;
+
+      # Flush the remaining text; it ends at the end time of the turn.
+      my $new_time = $segment_end;
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      # Reset the per-turn state.
+      @text = ();
+      $time = 0;
+      $segment_speaker = "XXX";
+      $segment_start = "XXX";
+      $segment_end = "XXX";
+      $segment_fidelity = "XXX";
+      $segment_mode = "XXX";
+    } elsif ($line =~ /<time/) {
+      # <time sec=...> splits a turn: flush the text collected so far.
+      my %tags = parse_sgml_tag $line;
+      my $new_time = $tags{'sec'};
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = $new_time;
+    } elsif ($line =~ /<\/time/) {
+      # Nothing to do: the text was already flushed at the opening tag.
+    } elsif ($line =~ /<overlap/ || $line =~ /<\/overlap/) {
+      # Overlap markers are dropped.
+    } elsif ($line =~ /<background/ || $line =~ /<comment/) {
+      # Background and comment annotations are dropped.
+    } elsif ($line =~ /<foreign/ || $line =~ /<\/foreign/) {
+      # Foreign-language markers are kept inline in the transcript text.
+      $line = trim $line;
+      push @text, $line;
+    } elsif ($line =~ /<unclear/ || $line =~ /<\/unclear/) {
+      # Unclear-speech markers are kept inline in the transcript text.
+      $line = trim $line;
+      push @text, $line;
+    } elsif ($line =~ /<[^\/]/) {
+      # Any other opening tag is reported and otherwise ignored.
+      print STDERR "$0: INFO: Unknown tag $line in file $file\n";
+    } elsif ($line =~ /<\//) {
+      # Any other closing tag is silently ignored.
+    } else {
+      # Plain transcript text.
+      $line = trim $line;
+      push @text, $line if $line;
+    }
+  }
+  close($f);
+
+  # Tags still open at the end of the file indicate a malformed transcript.
+  if (@tagqueue > 0) {
+    print STDERR "$0: WARNING: " . scalar(@tagqueue) . " unclosed tag(s) in file $file\n";
+  }
+}
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh b/egs/hub4_english/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh
new file mode 100755
index 00000000000..afa6d7e6531
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh
@@ -0,0 +1,63 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+# This script prepares the 1995 CSR-IV HUB4 corpus
+# https://catalog.ldc.upenn.edu/LDC96S31
+
+# Fail on errors, on failures inside pipelines and on unset variables.
+set -euo pipefail
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <SOURCE-DIR> <dir>"
+  echo " e.g.: $0 /export/corpora5/LDC/LDC96S31/csr95_hub4 data/local/data/csr95_hub4"
+  exit 1
+fi
+
+SOURCE_DIR=$1
+dir=$2
+
+# "<corpus sub-directory>:<prefix of the output files>" for each partition.
+partitions="train:train95 devtst:dev95 evltst:eval95"
+
+# All partition directories must exist.
+for p in $partitions; do
+  if [ ! -d $SOURCE_DIR/csr95/h4/${p%%:*} ]; then
+    echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC96S31 corpus"
+    exit 1
+  fi
+done
+
+mkdir -p $dir
+
+# "<recording-id> <path>" for every audio file; the id is the file name
+# without the .wav extension.
+for wav in $(ls $SOURCE_DIR/csr95/h4/*/*.wav); do
+  name=$(basename $wav)
+  echo "${name%.wav} $wav"
+done > $dir/wav_scp
+
+# Split the audio list by partition.
+for p in $partitions; do
+  grep "csr95/h4/${p%%:*}" $dir/wav_scp > $dir/${p##*:}_wav_scp
+done
+
+# Remove the outputs of a previous run, if any.
+rm $dir/*_{segments,utt2spk,text} || true
+
+# Lists of the transcript files of each partition.
+for p in $partitions; do
+  ls $SOURCE_DIR/csr95/h4/${p%%:*}/*.txt > $dir/${p##*:}_text.list
+done
+
+# Process every transcript into the segments, utt2spk and text files of the
+# partition(s) whose directory name occurs in its path.
+for txt in $(ls $SOURCE_DIR/csr95/h4/*/*.txt); do
+  for p in $partitions; do
+    if [[ $txt =~ "csr95/h4/${p%%:*}" ]]; then
+      local/data_prep/process_1995_bn_annotation.py $txt \
+        $dir/${p##*:}_segments $dir/${p##*:}_utt2spk $dir/${p##*:}_text
+    fi
+  done
+done
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1996_bn_data.sh b/egs/hub4_english/s5/local/data_prep/prepare_1996_bn_data.sh
new file mode 100755
index 00000000000..ea4e5699ce3
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_1996_bn_data.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal<jtrmal@gmail.com>)
+#               2017  Vimal Manohar
+# License: Apache 2.0
+
+# This script prepares the 1996 English Broadcast News (HUB4) corpus.
+# /export/corpora/LDC/LDC97S44 
+# /export/corpora/LDC/LDC97T22
+
+# Begin configuration section.
+# End configuration section
+# Abort on errors, on failures inside pipelines and on unset variables.
+set -euo pipefail
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <text-source> <speech-source> <out-dir>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC97T22/hub4_eng_train_trans /export/corpora/LDC/LDC97S44/data data/local/data/train_bn96"
+  exit 1
+fi
+
+text_source_dir=$1    # /export/corpora/LDC/LDC97T22/hub4_eng_train_trans
+speech_source_dir=$2  # /export/corpora/LDC/LDC97S44/data
+out=$3
+
+mkdir -p $out
+
+# Lists of the transcript (*.txt) and audio (*.sph) files of the corpus.
+ls $text_source_dir/*/*.txt > $out/text.list
+ls $speech_source_dir/*.sph > $out/audio.list
+for list in $out/text.list $out/audio.list; do
+  [ -s $list ] || { echo "$0: Could not get text and audio files"; exit 1; }
+done
+
+# Parse the SGML transcripts; the parser's diagnostics go to parse_sgml.log.
+local/data_prep/parse_sgm_1996_hub4_eng.pl $out/text.list \
+  > $out/transcripts.txt 2> $out/parse_sgml.log || exit 1
+
+if ! [ -s $out/transcripts.txt ]; then
+  echo "$0: Could not parse SGML files in $out/text.list"
+  exit 1
+fi
+
+echo "$0: 1996 English Broadcast News training data (HUB4) prepared in $out"
+exit 0
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh b/egs/hub4_english/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh
new file mode 100755
index 00000000000..f3f9c939e0b
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh
@@ -0,0 +1,59 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+# This script prepares the 1996 CSR HUB4 Language Model corpus
+# https://catalog.ldc.upenn.edu/LDC98T31
+
+# Fail on errors, on failures inside pipelines and on unset variables.
+set -euo pipefail
+
+nj=4         # number of parallel jobs
+cmd=run.pl   # command used to run the parallel jobs
+stage=0
+
+[ -f ./path.sh ] && . ./path.sh
+
+. utils/parse_options.sh
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <SOURCE-DIR> <dir>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC98T31/1996_csr_hub4_model data/local/data/csr96_hub4"
+  exit 1
+fi
+
+SOURCE_DIR=$1
+dir=$2
+
+mkdir -p $dir
+
+# List all *.stZ source files of the train and test partitions.
+for d in $SOURCE_DIR/st_train/ $SOURCE_DIR/st_test/; do
+  if [ ! -d $d ]; then
+    echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC98T31 corpus"
+    exit 1
+  fi
+  ls $d/*.stZ
+done | sort > $dir/filelist
+
+mkdir -p $dir/split$nj/
+
+if [ $stage -le 1 ]; then
+  # Name the split lists explicitly: the brace-expansion idiom
+  # filelist.{`seq -s, $nj`} is left unexpanded by bash when nj=1.
+  split_lists=$(for n in $(seq $nj); do echo $dir/split$nj/filelist.$n; done)
+  utils/split_scp.pl $dir/filelist $split_lists
+  $cmd JOB=1:$nj $dir/log/process_text.JOB.log \
+    local/data_prep/process_1996_csr_hub4_lm_filelist.py \
+    $dir/split$nj/filelist.JOB $dir
+fi
+
+# Lists of $dir/<name>.txt.gz paths, one per source file of each partition.
+for part in train test; do
+  for x in `ls $SOURCE_DIR/st_${part}/*.stZ`; do
+    y=`basename $x`
+    name=${y%.stZ}
+    echo $dir/${name}.txt.gz
+  done > $dir/$part.filelist
+done
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh b/egs/hub4_english/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh
new file mode 100755
index 00000000000..7c11531dda5
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh
@@ -0,0 +1,99 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+# This script prepares 1996 English Broadcast News Dev and Eval (HUB4)
+# https://catalog.ldc.upenn.edu/LDC97S66
+
+set -e
+set -o pipefail
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <SOURCE-DIR> <dir>"
+  echo "$0 /export/corpora/LDC/LDC97S66/1996_eng_bcast_dev_eval data/local/data/hub4_96_dev_eval"
+  exit 1
+fi
+
+SOURCE_DIR=$1
+dir=$2
+
+mkdir -p $dir
+# Both the dev and the eval data directories must exist.
+for d in $SOURCE_DIR/dev/devdata $SOURCE_DIR/eval/evaldata; do 
+  if [ ! -d $d ]; then
+    echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC97S66 corpus"
+    exit 1
+  fi
+done
+# Convert the UEM files into segments; utt2spk maps each utterance to its recording.
+for d in dev eval; do 
+  if [ $d == "dev" ]; then
+    suffix=dt
+  else
+    suffix=ev
+  fi
+  # parse_uem_line(None, ...) takes the recording-id from the UEM line itself.
+  python -c '
+import sys, os
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
+uem = sys.argv[1]
+for line in open(uem).readlines():
+  line = hub4_utils.parse_uem_line(None, line)
+  if line is not None:
+    print (line)' $SOURCE_DIR/${d}/${d}data/h496${suffix}.uem > $dir/${d}96_uem_segments
+  awk '{print $1" "$2}' $dir/${d}96_uem_segments > $dir/${d}96_uem_utt2spk
+done 
+# Convert the PEM files into segments and utt2spk; speaker-ids get the recording-id as prefix.
+for d in dev eval; do 
+  if [ $d == "dev" ]; then
+    suffix=dt
+  else
+    suffix=ev
+  fi
+  # The PEM file is read from stdin; the two output files are the arguments.
+  cat $SOURCE_DIR/${d}/${d}data/h496${suffix}.pem | \
+    python -c '
+import sys
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
+with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f:
+  for line in sys.stdin.readlines():
+    tup = hub4_utils.parse_cmu_seg_line(line, prepend_reco_to_spk=True)
+    if tup is not None:
+      segments_line, utt2spk_line = tup
+      s_f.write("{0}\n".format(segments_line))
+      u_f.write("{0}\n".format(utt2spk_line))' \
+        $dir/${d}96_pem_segments $dir/${d}96_pem_utt2spk
+done
+# The wav_scp entries below pipe each sphere file through sph2pipe.
+export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
+sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; }
+
+for x in `ls $SOURCE_DIR/dev/devdata/*.sph`; do
+  y=`basename $x`
+  z=${y%.sph}
+  echo "$z $sph2pipe -f wav $x |";
+done > $dir/dev96_wav_scp
+# Keep only the recordings that appear in the PEM / UEM segments.
+cat $dir/dev96_pem_segments | awk '{print $2}' | \
+  utils/filter_scp.pl /dev/stdin $dir/dev96_wav_scp > $dir/dev96_pem_wav_scp
+cat $dir/dev96_uem_segments | awk '{print $2}' | \
+  utils/filter_scp.pl /dev/stdin $dir/dev96_wav_scp > $dir/dev96_uem_wav_scp
+
+for x in `ls $SOURCE_DIR/eval/evaldata/*.sph`; do
+  y=`basename $x`
+  z=${y%.sph}
+  echo "$z $sph2pipe -f wav $x |";
+done > $dir/eval96_wav_scp
+# Copy the glm, utm and stm files shipped with the corpus.
+cp $SOURCE_DIR/eval/evaldata/et96_1.glm $dir/glm
+
+cp $SOURCE_DIR/eval/evaldata/et96_1.utm $dir/eval96_utm
+cp $SOURCE_DIR/dev/devdata/et96_1.utm $dir/dev96_utm
+
+cp $SOURCE_DIR/eval/evaldata/h496ev.stm $dir/eval96_stm
+
+cp $SOURCE_DIR/dev/devdata/h496dtpe.stm $dir/dev96_pem_stm
+cp $SOURCE_DIR/dev/devdata/h496dtue.stm $dir/dev96_uem_stm
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1997_bn_data.sh b/egs/hub4_english/s5/local/data_prep/prepare_1997_bn_data.sh
new file mode 100755
index 00000000000..5f049f7831c
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_1997_bn_data.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal<jtrmal@gmail.com>)
+#               2017  Vimal Manohar
+# License: Apache 2.0
+
+# This script prepares the 1997 English Broadcast News (HUB4) corpus.
+# /export/corpora/LDC/LDC98S71 
+# /export/corpora/LDC/LDC98T28
+
+# Begin configuration section.
+# End configuration section
+# Abort on the first failing command or failing pipeline stage.
+set -e -o pipefail
+set -o nounset             # Treat unset variables as an error
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <text-source> <speech-source> <out-dir>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 data/local/data/train_bn97"
+  exit 1
+fi
+
+text_source_dir=$1    # /export/corpora/LDC/LDC98T28/hub4e97_trans_980217
+speech_source_dir=$2  # /export/corpora/LDC/LDC98S71/97_eng_bns_hub4
+out=$3
+
+mkdir -p "$out"
+
+# List the transcript (.sgml) and audio (.sph) files.  A failing "ls" (no
+# matching file) is tolerated here: under "set -e" it would otherwise abort
+# the script before the explicit check below gets a chance to report it.
+ls "$text_source_dir"/transcrp/*.sgml > "$out/text.list" 2>/dev/null || true
+ls "$speech_source_dir"/*.sph > "$out/audio.list" 2>/dev/null || true
+
+if [ ! -s "$out/text.list" ] || [ ! -s "$out/audio.list" ]; then
+  echo "$0: Could not get text and audio files"
+  exit 1
+fi
+
+# Parse the listed SGML files: stdout goes to transcripts.txt, stderr to
+# parse_sgml.log.
+local/data_prep/parse_sgm_1997_hub4_eng.pl "$out/text.list" > \
+  "$out/transcripts.txt" 2> "$out/parse_sgml.log" || exit 1
+
+if [ ! -s "$out/transcripts.txt" ]; then
+  echo "$0: Could not parse SGML files in $out/text.list"
+  exit 1
+fi
+
+echo "$0: 1997 English Broadcast News training data (HUB4) prepared in $out"
+exit 0
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh b/egs/hub4_english/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh
new file mode 100755
index 00000000000..1a0f6f8d372
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh
@@ -0,0 +1,65 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+# This script prepares 1997 HUB4 English Evaluation corpus
+# https://catalog.ldc.upenn.edu/LDC2002S11
+
+# Abort on the first failing command or failing pipeline stage.
+set -e
+set -o pipefail
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <SOURCE-DIR> <dir>"
+  echo "$0 /export/corpora/LDC/LDC2002S11/hub4e_97 data/local/data/eval97"
+  exit 1
+fi
+
+SOURCE_DIR=$1
+dir=$2
+
+mkdir -p $dir
+
+# The corpus is expected to contain an h4e_evl/ sub-directory.
+if [ ! -d $SOURCE_DIR/h4e_evl/ ]; then
+  echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC2002S11 corpus"
+  exit 1
+fi
+
+# UEM segmentation: every line of the .uem file is passed to
+# hub4_utils.parse_uem_line together with the file's base name (extension
+# removed); non-None results are printed and collected in $dir/segments.
+for uem in $SOURCE_DIR/h4e_evl/h4e_97.uem; do
+  python -c '
+import sys, os
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
+uem = sys.argv[1]
+reco, ext = os.path.splitext(os.path.basename(uem))
+for line in open(uem).readlines():
+  line = hub4_utils.parse_uem_line(reco, line)
+  if line is not None:
+    print (line)' $uem
+done > $dir/segments
+# utt2spk is made of the first two fields of each segments line.
+awk '{print $1" "$2}' $dir/segments > $dir/utt2spk
+
+# PEM segmentation: every line of the .seg file is passed to
+# hub4_utils.parse_cmu_seg_line; for non-None results the returned pair is
+# written to $dir/segments.pem and $dir/utt2spk.pem respectively.
+cat $SOURCE_DIR/h4e_evl/h4e_97.seg | \
+  python -c '
+import sys
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
+with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f:
+  for line in sys.stdin.readlines():
+    tup = hub4_utils.parse_cmu_seg_line(line)
+    if tup is not None:
+      segments_line, utt2spk_line = tup
+      s_f.write("{0}\n".format(segments_line))
+      u_f.write("{0}\n".format(utt2spk_line))' \
+        $dir/segments.pem $dir/utt2spk.pem
+ 
+# wav.scp: one "<name> <sph2pipe> -f wav <file> |" entry per .sph file, where
+# <name> is the file's base name without the .sph extension.
+export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
+sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; }
+for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do
+  y=`basename $x`
+  z=${y%.sph}
+  echo "$z $sph2pipe -f wav $x |";
+done > $dir/wav.scp
+
+# Copy the glm and stm files shipped with the corpus.
+cp $SOURCE_DIR/h4e_evl/h4e_97_1.glm $dir/glm
+cp $SOURCE_DIR/h4e_evl/h4e_97.stm $dir/stm
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh b/egs/hub4_english/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh
new file mode 100755
index 00000000000..3d9edf01579
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh
@@ -0,0 +1,65 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+# This script prepares 1998 HUB4 Broadcast News Evaluation English Test Material
+# https://catalog.ldc.upenn.edu/LDC2000S86
+
+# Abort on the first failing command or failing pipeline stage.
+set -e
+set -o pipefail
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <SOURCE-DIR> <dir>"
+  echo "$0 /export/corpora/LDC/LDC2000S86/ data/local/data/eval98"
+  exit 1
+fi
+
+SOURCE_DIR=$1
+dir=$2
+
+mkdir -p $dir
+
+# The corpus is expected to contain an h4e_evl/ sub-directory.
+if [ ! -d $SOURCE_DIR/h4e_evl/ ]; then
+  echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC2000S86 corpus"
+  exit 1
+fi
+
+# UEM segmentation for both parts (h4e_98_1, h4e_98_2): every line of each
+# .uem file is passed to hub4_utils.parse_uem_line together with the file's
+# base name (extension removed); non-None results go to $dir/segments.
+for uem in $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.uem; do
+  python -c '
+import sys, os
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
+uem = sys.argv[1]
+reco, ext = os.path.splitext(os.path.basename(uem))
+for line in open(uem).readlines():
+  line = hub4_utils.parse_uem_line(reco, line)
+  if line is not None:
+    print (line)' $uem
+done > $dir/segments
+# utt2spk is made of the first two fields of each segments line.
+awk '{print $1" "$2}' $dir/segments > $dir/utt2spk
+
+# PEM segmentation: the two .seg files are concatenated and every line is
+# passed to hub4_utils.parse_cmu_seg_line; for non-None results the returned
+# pair is written to $dir/segments.pem and $dir/utt2spk.pem respectively.
+cat $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.seg | \
+  python -c '
+import sys
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
+with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f:
+  for line in sys.stdin.readlines():
+    tup = hub4_utils.parse_cmu_seg_line(line)
+    if tup is not None:
+      segments_line, utt2spk_line = tup
+      s_f.write("{0}\n".format(segments_line))
+      u_f.write("{0}\n".format(utt2spk_line))' \
+        $dir/segments.pem $dir/utt2spk.pem
+ 
+# wav.scp: one "<name> <sph2pipe> -f wav <file> |" entry per .sph file, where
+# <name> is the file's base name without the .sph extension.
+export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
+sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; }
+for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do
+  y=`basename $x`
+  z=${y%.sph}
+  echo "$z $sph2pipe -f wav $x |";
+done > $dir/wav.scp
+
+# Copy the glm and stm files shipped with the corpus.
+cp $SOURCE_DIR/h4e_evl/h4e_98.glm $dir/glm
+cp $SOURCE_DIR/h4e_evl/h4e_98.stm $dir/stm
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh b/egs/hub4_english/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh
new file mode 100755
index 00000000000..2d6a37228db
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh
@@ -0,0 +1,72 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+# This script prepares 1999 HUB4 Broadcast News Evaluation English Test Material
+# https://catalog.ldc.upenn.edu/LDC2000S88
+
+# Abort on the first failing command or failing pipeline stage.
+set -e
+set -o pipefail
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <SOURCE-DIR> <dir>"
+  echo "$0 /export/corpora5/LDC/LDC2000S88/hub4_1999 data/local/data/eval99"
+  exit 1
+fi
+
+SOURCE_DIR=$1
+dir=$2
+
+mkdir -p $dir
+
+# The corpus is expected to contain a bnews_99/ sub-directory.  The offending
+# path is included in the message, as in the 1997 and 1998 eval scripts.
+if [ ! -d $SOURCE_DIR/bnews_99/ ]; then
+  echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC2000S88 corpus"
+  exit 1
+fi
+
+export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
+sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; }
+
+# The evaluation set has two parts; each gets its own set of output files,
+# prefixed with eval99_1 or eval99_2.
+for f in bn99en_1 bn99en_2; do
+  if [ "$f" == "bn99en_1" ]; then
+    affix=eval99_1
+  else
+    affix=eval99_2
+  fi
+
+  # UEM segmentation: every line of the .uem file is passed to
+  # hub4_utils.parse_uem_line together with the file's base name (extension
+  # removed); non-None results are written to ${affix}_uem_segments.
+  python -c '
+import sys, os
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
+uem = sys.argv[1]
+reco, ext = os.path.splitext(os.path.basename(uem))
+for line in open(uem).readlines():
+  line = hub4_utils.parse_uem_line(reco, line)
+  if line is not None:
+    print (line)' $SOURCE_DIR/bnews_99/$f.uem > $dir/${affix}_uem_segments
+
+  # utt2spk is made of the first two fields of each segments line.
+  awk '{print  $1" "$2}' $dir/${affix}_uem_segments > $dir/${affix}_uem_utt2spk
+
+  # PEM segmentation: every line of the .seg file is passed to
+  # hub4_utils.parse_cmu_seg_line; for non-None results the returned pair is
+  # written to the pem segments and utt2spk files respectively.
+  cat $SOURCE_DIR/bnews_99/$f.seg | \
+    python -c '
+import sys
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
+with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f:
+  for line in sys.stdin.readlines():
+    tup = hub4_utils.parse_cmu_seg_line(line)
+    if tup is not None:
+      segments_line, utt2spk_line = tup
+      s_f.write("{0}\n".format(segments_line))
+      u_f.write("{0}\n".format(utt2spk_line))' \
+        $dir/${affix}_pem_segments $dir/${affix}_pem_utt2spk
+
+  # Single-entry wav.scp for this part.
+  echo "$f $sph2pipe -f wav $SOURCE_DIR/bnews_99/$f.sph |" > ${dir}/${affix}_wav_scp
+done
+
+# Copy the glm and stm files used for each part.
+cp $SOURCE_DIR/bnews_99/en981118.glm $dir/eval99_1_glm
+cp $SOURCE_DIR/bnews_99/bn99en_1.stm $dir/eval99_1_stm
+
+cp $SOURCE_DIR/bnews_99/en991231.glm $dir/eval99_2_glm
+cp $SOURCE_DIR/bnews_99/bn99en_2.stm $dir/eval99_2_stm
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_corpus.sh b/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_corpus.sh
new file mode 100755
index 00000000000..9835d69a37e
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_corpus.sh
@@ -0,0 +1,63 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+# This script prepares the North American News Text Corpus
+# https://catalog.ldc.upenn.edu/LDC95T21
+
+[ -f ./path.sh ] && . ./path.sh
+. ./cmd.sh
+
+set -e
+set -o pipefail
+set -u
+
+nj=4
+cmd=run.pl
+
+. utils/parse_options.sh
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <SOURCE-DIR> <DIR>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC95T21 data/local/data/na_news"
+  exit 1
+fi
+
+SOURCE_DIR=$1
+dir=$2
+
+dir_list=
+
+rm -f $dir/.error 2>/dev/null
+
+for x in $SOURCE_DIR/*/*/*; do
+  year=`basename $x`
+  newspaper=`basename $(dirname $x)`
+  d=$dir/${newspaper}_${year}
+
+  dir_list="$dir_list $d"
+
+  list_file=$d/articles.list
+  ls $x/*.gz > $list_file
+  
+  mkdir -p $d/split$nj
+
+  eval utils/split_scp.pl $d/articles.list \
+    $d/split$nj/articles.list.{`seq -s, $nj`}
+
+  $cmd JOB=1:$nj $d/log/get_processed_text.JOB.log \
+    local/data_prep/process_na_news_text.py $d/split$nj/articles.list.JOB \
+    $d/corpus.JOB.gz || touch $dir/.error &
+done
+
+wait
+
+if [ -f $dir/.error ]; then
+  echo "$0: Failed to process files."
+fi
+
+for d in $dir_list; do
+  gunzip -c $d/corpus.*.gz | gzip -c > $d/corpus.gz || exit 1
+  rm $d/corpus.*.gz
+done
diff --git a/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_supplement.sh b/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_supplement.sh
new file mode 100755
index 00000000000..f7f810c2326
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_supplement.sh
@@ -0,0 +1,72 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+# This script prepares the North American News Text Supplement Corpus
+# https://catalog.ldc.upenn.edu/LDC98T30
+
+[ -f ./path.sh ] && . ./path.sh
+. ./cmd.sh
+
+set -e
+set -o pipefail
+set -u
+
+nj=4
+cmd=run.pl
+
+. utils/parse_options.sh
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <SOURCE-DIR> <DIR>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC98T30/northam_news_txt_sup data/local/data/na_news_supp"
+  exit 1
+fi
+
+SOURCE_DIR=$1
+dir=$2
+
+dir_list=
+
+rm -f $dir/.error 2>/dev/null
+
+for x in $SOURCE_DIR/nyt/*/ $SOURCE_DIR/latwp/ $SOURCE_DIR/apws/*/; do
+  year=`basename $x`
+  newspaper=`basename $(dirname $x)`
+
+  d=$dir/${newspaper}_${year}
+  
+  if [ $year == latwp ]; then
+    d=$dir/latwp_1997
+  elif [ $year == english ]; then
+    d=$dir/apws
+  fi
+
+  mkdir -p $d
+
+  dir_list="$dir_list $d"
+
+  list_file=$d/articles.list
+  ls $x/*.gz > $list_file
+  
+  mkdir -p $d/split$nj
+
+  eval utils/split_scp.pl $d/articles.list \
+    $d/split$nj/articles.list.{`seq -s, $nj`}
+
+  $cmd JOB=1:$nj $d/log/get_processed_text.JOB.log \
+    local/data_prep/process_na_news_text.py $d/split$nj/articles.list.JOB \
+    $d/corpus.JOB.gz || touch $dir/.error &
+done
+
+wait
+
+if [ -f $dir/.error ]; then
+  echo "$0: Failed to process files."
+fi
+
+for d in $dir_list; do
+  gunzip -c $d/corpus.*.gz | gzip -c > $d/corpus.gz || exit 1
+  rm $d/corpus.*.gz
+done
diff --git a/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py b/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py
new file mode 100755
index 00000000000..be0c7ad8e0d
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py
@@ -0,0 +1,273 @@
+#! /usr/bin/env python
+
+# Copyright 2016    Vimal Manohar
+# Apache 2.0.
+
+"""This script process a 1995 CSR-IV annotation file and writes to
+utt2spk, segments and text files.
+"""
+
+from __future__ import print_function
+import argparse
+import os
+import logging
+import re
+from bs4 import BeautifulSoup
+import hub4_utils
+
+
+# Module-level logger: INFO and above are sent to a StreamHandler, and each
+# record carries a timestamp, the source path/line, the function name and the
+# level.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
+                              "%(funcName)s - %(levelname)s ] %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+
+def get_args():
+    """Get command-line arguments"""
+
+    parser = argparse.ArgumentParser("Process 1995 CSR-IV HUB4 transcripts")
+
+    parser.add_argument("--noise-word", type=str, default="<NOISE>",
+                        help="Word to add in-place of noise words")
+    parser.add_argument("--spoken-noise-word", type=str,
+                        default="<SPOKEN_NOISE>",
+                        help="Word to add in-place of speaker noise words")
+    parser.add_argument("in_file", type=argparse.FileType('r'),
+                        help="Input transcript file")
+    parser.add_argument("segments_file", type=argparse.FileType('a'),
+                        help="Output segments file")
+    parser.add_argument("utt2spk_file", type=argparse.FileType('a'),
+                        help="Output utt2spk file")
+    parser.add_argument("text_file", type=argparse.FileType('a'),
+                        help="Output text file")
+
+    args = parser.parse_args()
+    return args
+
+
+class Segment(object):
+    """Class to store an utterance (segment)"""
+
+    def __init__(self, reco_id, spk=None, start_time=-1,
+                 end_time=-2, text=""):
+        """The arguments are straight-forward.
+        spk can be None if speaker is not known, in which case the utterance-id
+        and speaker-id are made the same.
+        end_time can be -1 to mean the end of the recording.
+        """
+        self.reco_id = reco_id
+        self.spk = spk
+        self.start_time = float(start_time)
+        self.end_time = float(end_time)
+        self.text = text
+
+    def get_utt_id(self):
+        """Return the utterance-id, which is
+        <recording-id>-<start-frame>-<end-frame> if spk is not known.
+        Otherwise it is speaker-id is added as a suffix to <recording-id>
+        above.
+        """
+        if self.spk is None:
+            return "{reco}-{0:06d}-{1:06d}".format(
+                int(self.start_time * 100), int(self.end_time * 100),
+                reco=self.reco_id)
+        return "{reco}-{spk}-{0:06d}-{1:06d}".format(
+            int(self.start_time * 100), int(self.end_time * 100),
+            reco=self.reco_id, spk=self.spk)
+
+    def get_spk_id(self):
+        """Returns the speaker-id appended to the recording-id, if speaker is
+        known. Otherwise returns the utterance-id as speaker-id.
+        """
+        if self.spk is None:
+            return "{reco}-{0:06d}-{1:06d}".format(
+                int(self.start_time * 100), int(self.end_time * 100),
+                reco=self.reco_id)
+        return "{reco}-{spk}".format(reco=self.reco_id, spk=self.spk)
+
+    def write_utt2spk(self, out_file):
+        """Writes this segment's entry into utt2spk file."""
+        print ("{0} {1}".format(self.get_utt_id(), self.get_spk_id()),
+               file=out_file)
+
+    def write_segment(self, out_file):
+        """Writes this segment's entry into segments file."""
+        print ("{0} {1} {2:.3f} {3:.3f}".format(
+                    self.get_utt_id(), self.reco_id,
+                    self.start_time, self.end_time),
+               file=out_file)
+
+    def write_text(self, out_file):
+        """Writes this segment's entry into kaldi text file."""
+        print ("{0} {1}".format(self.get_utt_id(), self.text),
+               file=out_file)
+
+
+def write_segments(segments, args):
+    """Write segments with non-empty transcripts."""
+    for segment in segments:
+        if len(segment.text) == 0:
+            continue
+        segment.write_utt2spk(args.utt2spk_file)
+        segment.write_segment(args.segments_file)
+        segment.write_text(args.text_file)
+
+
+def process_text(text, noise_word, spoken_noise_word):
+    """Returns normalized text"""
+    text = re.sub(r"\[pause\]", "", text)
+    text = hub4_utils.normalize_csr_transcript(text, noise_word,
+                                               spoken_noise_word)
+    return text
+
+
+# "<spk>(bt=<begin> et=<end>): <text>" -- groups: speaker, bt, et, text.
+test_spk_matcher = re.compile(r"(\S+)\(bt=(\S+)\set=(\S+)\):\s(.+)$")
+# "<spk>: <text>" -- groups: speaker, text.
+train_spk_matcher = re.compile(r"(\S+):\s(.+)$")
+
+
+def process_story_content(args, reco_id, content,
+                          start_time, end_time):
+    """Process the contents of a story and convert them into segments.
+
+    Arguments:
+        args -- A reference to the CLI arguments
+        reco_id -- Recording id
+        content -- A string containing all the contents of a story (or the
+                   stuff before the story like the credits and announcements).
+                   It is split on double-newline characters.
+        start_time -- Start time of this 'story'.
+        end_time -- End time of this 'story'.
+
+    Returns:
+        A list of Segment objects. Some of them may have empty text;
+        write_segments() skips those.
+    """
+
+    segments = []
+    # Accumulates text that has no timing of its own. Its end time is a
+    # placeholder (-2) until the next timed line or the end of the content.
+    segment_tmp = Segment(reco_id=reco_id, spk=None,
+                          start_time=start_time, end_time=-2, text="")
+
+    for line in content.split('\n\n'):
+        line = re.sub('\n', ' ', line)
+
+        # Skip pieces that are empty, whitespace only, or made of a single
+        # bracketed tag.
+        if len(line) == 0 or re.match(r"\[[^]]+\]$|\s*$", line):
+            continue
+
+        m = test_spk_matcher.match(line)
+        if m:
+            # A line of story in test file that has start and end times
+            # and speaker name.
+            spk = m.group(1)
+            bt = float(m.group(2))
+            et = float(m.group(3))
+
+            # Once we know the end-time of the temporary segment, we can
+            # write that out (Only if it is non-empty).
+            # NOTE(review): the append below is unconditional; an empty
+            # temporary segment is still appended (with end_time -2).
+            if len(segment_tmp.text) > 0:
+                segment_tmp.end_time = bt
+            segments.append(segment_tmp)
+            # Start a new temporary segment right after this timed line.
+            segment_tmp = Segment(reco_id, spk=None, start_time=et)
+
+            text = process_text(m.group(4), args.noise_word,
+                                args.spoken_noise_word)
+            if len(text) == 0 or re.match(r"\[[^]]+\]$|\s*$", text):
+                continue
+            segments.append(Segment(reco_id=reco_id, spk=spk,
+                                    start_time=bt, end_time=et,
+                                    text=text))
+            continue
+
+        m = train_spk_matcher.match(line)
+        if m:
+            # A line of story in train file that has no time segment
+            # information. So speaker information is not useful.
+            text = process_text(m.group(2), args.noise_word,
+                                args.spoken_noise_word)
+        else:
+            # A line of story that does not have a speaker marking.
+            text = process_text(line, args.noise_word, args.spoken_noise_word)
+        if len(text) == 0 or re.match(r"\[[^]]+\]$|\s*$", text):
+            continue
+        # Untimed text is appended to the temporary segment.
+        segment_tmp.text += ' ' + text
+
+    # The last temporary segment ends where the story ends.
+    if len(segment_tmp.text) > 0:
+        segment_tmp.end_time = end_time
+    segments.append(segment_tmp)
+
+    return segments
+
+
+def process_float(string):
+    string = re.sub(r"'|\"", "", string)
+    return float(string)
+
+
+def run(args):
+    """Parse the annotation file in args.in_file and write its segments.
+
+    The recording id is the base name of the input file without extension.
+    Every <broadcast> element is walked child by child: <story> elements are
+    converted with their own bt/et times, and the content found between
+    stories is collected and converted as a separate piece.
+    """
+    base = os.path.basename(args.in_file.name)
+    reco_id = os.path.splitext(base)[0]
+
+    doc = ''.join(args.in_file.readlines())
+
+    soup = BeautifulSoup(doc, 'lxml')
+    for broadcast in soup.find_all('broadcast'):
+        # Content seen outside of any <story>, pending until the next story
+        # (or the end of the broadcast) provides its end time.
+        non_story_contents = []
+        start_time = 0.0
+        end_time = -1.0
+        for s in broadcast.children:
+            try:
+                if s.name == 'story':
+                    story_begin_time = process_float(s['bt'])
+                    story_end_time = process_float(s['et'])
+                    # Keep the text of <language> and <sung> elements but
+                    # drop the tags themselves.
+                    for x in s.find_all('language') + s.find_all('sung'):
+                        x.replaceWithChildren()
+                    if len(non_story_contents):
+                        # The pending non-story content ends where this
+                        # story begins.
+                        end_time = story_begin_time
+                        segments = process_story_content(
+                            args, reco_id, ' '.join(non_story_contents),
+                            start_time=start_time, end_time=end_time)
+                        write_segments(segments, args)
+                        non_story_contents = []
+                        start_time = story_end_time
+                    segments = process_story_content(
+                        args, reco_id,
+                        ' '.join([unicode(x) for x in s.children]),
+                        start_time=story_begin_time, end_time=story_end_time)
+                    write_segments(segments, args)
+                elif (s.name is not None and s.name != "language"
+                      and s.name != 'sung'):
+                    raise RuntimeError(
+                        "Expected a NavigableString or <story> "
+                        "or <language> or <sung>; got {0}".format(s))
+                elif s.name == "language" or s.name == "sung":
+                    non_story_contents.append(
+                        ' '.join([unicode(x) for x in s.children]))
+                else:
+                    # s.name is None here: a plain string node.
+                    non_story_contents.append(unicode(s))
+            except RuntimeError:
+                raise
+            except Exception:
+                logger.error("Failed to process broadcast children %s", s)
+                raise
+        # End for loop over broadcast children
+        # Whatever is left after the last story is written with end_time=-1.
+        if len(non_story_contents) > 0:
+            segments = process_story_content(
+                args, reco_id, ' '.join(non_story_contents),
+                start_time=start_time, end_time=-1)
+            write_segments(segments, args)
+
+
+def main():
+    try:
+        args = get_args()
+        run(args)
+    except Exception:
+        raise
+    finally:
+        for f in [args.in_file, args.segments_file,
+                  args.utt2spk_file, args.text_file]:
+            if f is not None:
+                f.close()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py b/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py
new file mode 100755
index 00000000000..95aa7ddb831
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py
@@ -0,0 +1,165 @@
+#! /usr/bin/env python
+
+# Copyright 2016    Vimal Manohar
+# Apache 2.0.
+
+"""Prepare CSR-IV 1996 Language model text corpus (LDC98T31)."""
+
+from __future__ import print_function
+import argparse
+import gzip
+import logging
+import os
+import re
+import subprocess
+import sys
+
+from bs4 import BeautifulSoup
+
+sys.path.insert(0, 'steps')
+import libs.common as common_lib
+
+# Module-level logger: INFO and above are sent to a StreamHandler, and each
+# record carries a timestamp, the source path/line, the function name and the
+# level. get_args() lowers both levels to DEBUG when --verbose is above 2.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
+                              "%(funcName)s - %(levelname)s ] %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+
+def get_args():
+    """Parses command-line arguments."""
+
+    parser = argparse.ArgumentParser("""Prepare CSR-IV 1996 Language model text
+    corpus (LDC98T31).""")
+    parser.add_argument("--verbose", choices=[0,1,2,3], type=int, default=0,
+                        help="Set higher for more verbose logging.")
+    parser.add_argument("file_list", type=str,
+                        help="""List of compressed source files""")
+    parser.add_argument("dir", type=str,
+                        help="Output directory to dump processed files to")
+
+    args = parser.parse_args()
+
+    if args.verbose > 2:
+        logger.setLevel(logging.DEBUG)
+        handler.setLevel(logging.DEBUG)
+
+    return args
+
+
+def normalize_text(text, remove_punct=False):
+    """Normalizes text and returns the normalized version.
+    The normalization involves converting text to upper case.
+    """
+    text1 = text.strip()
+    text2 = text1.upper()
+    text2 = re.sub(r" [ ]*", " ", text2)
+    text2 = re.sub(r"([A-Z][A-Z])[.!,;]\s", "\1", text2)  # remove punctuations
+    return text2
+
+
+def process_file_lines(lines, out_file_handle):
+    """Processes input lines from a file by removing SGML tags and
+    writes normalized plain text to output stream."""
+
+    doc = re.sub(r"<s>", "<s></s>", ''.join(lines))
+    if doc == '':
+        return False
+
+    soup = BeautifulSoup(doc, 'lxml')
+
+    num_written = 0
+
+    for art in soup.html.body.children:
+        try:
+            if art.name != "art":
+                continue
+            for para in art.find_all('p'):
+                assert para.name == 'p'
+
+                for x in para.contents:
+                    try:
+                        if x.name is None:
+                            normalized_text = normalize_text(unicode(x))
+                            if len(normalized_text) == 0:
+                                continue
+                            out_file_handle.write("{0}\n".format(
+                                normalized_text.encode('ascii')))
+                            num_written += 1
+                    except Exception:
+                        logger.error("Failed to process content %s in para "
+                                     "%s", x, para)
+                        raise
+
+        except Exception:
+            try:
+                logger.error("Failed to process article %s", art['id'])
+            except AttributeError:
+                logger.error("Failed to process body content %s", art)
+            raise
+    if num_written == 0:
+        raise RuntimeError("0 sentences written.")
+    return True
+
+
+def _run(args):
+    """The one that does it all."""
+
+    for line in open(args.file_list).readlines():
+        try:
+            file_ = line.strip()
+            base_name = os.path.basename(file_)
+            name = os.path.splitext(base_name)[0]
+
+            out_file = gzip.open("{0}/{1}.txt.gz".format(args.dir, name),
+                                 'w')
+
+            logger.info("Running LM pipefile for |%s|...", base_name)
+
+            command = (
+                "gunzip -c {0} | "
+                "tools/csr4_utils/pare-sgml.perl | "
+                "perl tools/csr4_utils/bugproc.perl | "
+                "perl tools/csr4_utils/numhack.perl | "
+                "perl tools/csr4_utils/numproc.perl "
+                "  -xtools/csr4_utils/num_excp | "
+                "perl tools/csr4_utils/abbrproc.perl "
+                "  tools/csr4_utils/abbrlist | "
+                "perl tools/csr4_utils/puncproc.perl -np"
+                "".format(file_))
+            logger.debug("Running command '%s'", command)
+
+            p = subprocess.Popen(command,
+                                 stdout=subprocess.PIPE, shell=True)
+
+            stdout = p.communicate()[0]
+            if p.returncode is not 0:
+                logger.error(
+                    "Command '%s' failed with return status %d",
+                    command, p.returncode)
+                raise RuntimeError
+
+            if not process_file_lines(stdout, out_file):
+                logger.warn("File %s empty or could not be processed.",
+                            file_)
+        except Exception:
+            logger.error("Failed processing file %s", file_)
+            raise
+
+
+def main():
+    """The main function"""
+    try:
+        args = get_args()
+        _run(args)
+    except Exception:
+        logger.error("Failed to process all files", exc_info=True)
+        sys.exit(1)
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/hub4_english/s5/local/data_prep/process_na_news_text.py b/egs/hub4_english/s5/local/data_prep/process_na_news_text.py
new file mode 100755
index 00000000000..94b02a766a9
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/process_na_news_text.py
@@ -0,0 +1,151 @@
+#! /usr/bin/env python
+
+# Copyright 2016    Vimal Manohar
+# Apache 2.0.
+
+"""Prepare NA News Text Corpus (LDC95T21)
+or NA News Text Supplement Corpus (LDC98T30)."""
+
+from __future__ import print_function
+import argparse
+import gzip
+import logging
+import re
+import subprocess
+import sys
+
+from bs4 import BeautifulSoup
+
+sys.path.insert(0, 'local/data_prep')
+import hub4_utils
+
+sys.path.insert(0, 'steps')
+import libs.common as common_lib
+
+# Module-level logger: INFO and above are sent to a StreamHandler, and each
+# record carries a timestamp, the source path/line, the function name and the
+# level. get_args() lowers both levels to DEBUG when --verbose is above 2.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
+                              "%(funcName)s - %(levelname)s ] %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+
+def get_args():
+    """Parses command-line arguments."""
+
+    parser = argparse.ArgumentParser("Prepare NA News Text corpus (LDC95T21).")
+    parser.add_argument("--verbose", type=int, choices=[0, 1, 2, 3], default=0,
+                        help="Use larger verbosity for more verbose logging.")
+    parser.add_argument("file_list", type=str,
+                        help="List of compressed source files for NA News Text. "
+                        "e.g: /export/corpora/LDC/LDC95T21/na_news_1/latwp/1994")
+    parser.add_argument("out_file", type=str,
+                        help="Output file to write to.")
+
+    args = parser.parse_args()
+
+    if args.verbose > 2:
+        logger.setLevel(logging.DEBUG)
+        handler.setLevel(logging.DEBUG)
+
+    return args
+
+
+def normalize_text(text):
+    """Normalizes text and returns the normalized version.
+    The normalization involves converting text to upper case.
+    """
+    text1 = text.strip()
+    text2 = hub4_utils.remove_punctuations(text1)
+    text2 = text2.upper()
+    return text2
+
+
+def process_file_lines(lines, out_file_handle):
+    """Processes input lines from a file by removing SGML tags and
+    writes normalized plain text to output stream."""
+    doc = ''
+    for line in lines:
+        line = re.sub(r"<artID>([^</])+</DOCID>", "", line)
+        line = re.sub(r"<p>", "<p></p>", line)
+        doc += line
+
+    if doc == '':
+        return False
+
+    soup = BeautifulSoup(doc, 'lxml')
+
+    num_written = 0
+
+    for art in soup.html.body.children:
+        try:
+            if art.name != "art":
+                continue
+            for para in art.find_all('p'):
+                assert para.name == 'p'
+                text = ' '.join([unicode(x).strip() for x in para.contents])
+                normalized_text = normalize_text(text)
+                out_file_handle.write("{0}\n".format(
+                    normalized_text.encode('ascii')))
+                num_written += 1
+        except:
+            logger.error("Failed to process document %s", doc)
+            raise
+    if num_written == 0:
+        raise RuntimeError("0 sentences written.")
+    return True
+
+
+def _run(args):
+    """The one that does it all."""
+
+    with gzip.open(args.out_file, 'w') as writer:
+        for line in open(args.file_list).readlines():
+            try:
+                file_ = line.strip()
+                command = (
+                    "gunzip -c {0} | "
+                    "tools/csr4_utils/pare-sgml.perl | "
+                    "perl tools/csr4_utils/bugproc.perl | "
+                    "perl tools/csr4_utils/numhack.perl | "
+                    "perl tools/csr4_utils/numproc.perl "
+                    "  -xtools/csr4_utils/num_excp | "
+                    "perl tools/csr4_utils/abbrproc.perl "
+                    "  tools/csr4_utils/abbrlist | "
+                    "perl tools/csr4_utils/puncproc.perl -np"
+                    "".format(file_))
+                logger.debug("Running command '%s'", command)
+
+                p = subprocess.Popen(command,
+                                     stdout=subprocess.PIPE, shell=True)
+
+                stdout = p.communicate()[0]
+                if p.returncode is not 0:
+                    logger.error(
+                        "Command '%s' failed with return status %d",
+                        command, p.returncode)
+                    raise RuntimeError
+
+                if not process_file_lines(stdout, writer):
+                    logger.warn("File %s empty or could not be processed.",
+                                file_)
+            except Exception:
+                logger.error("Failed processing file %s", file_)
+                raise
+
+
+def main():
+    """The main function"""
+    try:
+        args = get_args()
+        _run(args)
+    except Exception:
+        logger.error("Failed to process all files", exc_info=True)
+        sys.exit(1)
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/egs/hub4_english/s5/local/dict b/egs/hub4_english/s5/local/dict
new file mode 120000
index 00000000000..384304fdf2a
--- /dev/null
+++ b/egs/hub4_english/s5/local/dict
@@ -0,0 +1 @@
+../../../wsj/s5/local/dict/
\ No newline at end of file
diff --git a/egs/hub4_english/s5/local/format_data.sh b/egs/hub4_english/s5/local/format_data.sh
new file mode 100755
index 00000000000..98e7eda08ab
--- /dev/null
+++ b/egs/hub4_english/s5/local/format_data.sh
@@ -0,0 +1,133 @@
+#! /bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0.
+
+set -e  # abort on the first failing command
+set -o pipefail  # a pipeline fails when any of its stages fails
+
+echo "$0 $@"  # Print the command line for logging
+
+noise_word="<NOISE>"  # 1st argument of the normalize_bn9x_transcripts.pl calls
+spoken_noise_word="<SPOKEN_NOISE>"  # 2nd argument of the same calls
+
+. utils/parse_options.sh || exit 1;
+
+. ./path.sh || exit 1;
+
+if [ $# -ne 0 ]; then  # no positional arguments are accepted
+  echo "Usage: $0"
+  exit 1
+fi
+
+srcdir=data/local/data  # all input files below are read from here
+tmpdir=data/local/  # NOTE(review): not referenced anywhere else in this script
+
+export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
+
+###############################################################################
+# Format 1996 English Broadcast News Train (HUB4)
+###############################################################################
+mkdir -p data/train_bn96
+
+local/data_prep/format_1996_bn_data.pl \
+  $srcdir/train_bn96/audio.list $srcdir/train_bn96/transcripts.txt \
+  data/train_bn96 || exit 1
+
+mv data/train_bn96/text data/train_bn96/text.unnorm  # keep the un-normalized text; 'text' is rewritten below
+local/data_prep/normalize_bn96_transcripts.pl $noise_word $spoken_noise_word \
+  < data/train_bn96/text.unnorm > data/train_bn96/text
+
+###############################################################################
+# Format 1997 English Broadcast News Train (HUB4)
+###############################################################################
+mkdir -p data/train_bn97
+
+local/data_prep/format_1997_bn_data.pl \
+  $srcdir/train_bn97/audio.list $srcdir/train_bn97/transcripts.txt \
+  data/train_bn97 || exit 1
+
+mv data/train_bn97/text data/train_bn97/text.unnorm  # keep the un-normalized text; 'text' is rewritten below
+local/data_prep/normalize_bn97_transcripts.pl $noise_word $spoken_noise_word \
+  < data/train_bn97/text.unnorm > data/train_bn97/text
+
+###############################################################################
+# Format 1996 English Broadcast News Dev (HUB4)
+###############################################################################
+mkdir -p data/dev96pe  # receives the dev96_pem_* files
+mkdir -p data/dev96ue  # receives the dev96_uem_* files
+
+cp $srcdir/hub4_96_dev_eval/dev96_uem_segments data/dev96ue/segments
+cp $srcdir/hub4_96_dev_eval/dev96_uem_utt2spk data/dev96ue/utt2spk
+cp $srcdir/hub4_96_dev_eval/dev96_uem_wav_scp data/dev96ue/wav.scp
+cp $srcdir/hub4_96_dev_eval/dev96_uem_stm data/dev96ue/stm
+cp $srcdir/hub4_96_dev_eval/glm data/dev96ue/glm
+
+awk '{if ($4 > $3) print $0}' $srcdir/hub4_96_dev_eval/dev96_pem_segments \
+  > data/dev96pe/segments  # only lines with field 4 > field 3 are kept (presumably end > start; confirm)
+cp $srcdir/hub4_96_dev_eval/dev96_pem_utt2spk data/dev96pe/utt2spk
+cp $srcdir/hub4_96_dev_eval/dev96_pem_wav_scp data/dev96pe/wav.scp
+cp $srcdir/hub4_96_dev_eval/dev96_pem_stm data/dev96pe/stm
+cp $srcdir/hub4_96_dev_eval/glm data/dev96pe/glm
+
+###############################################################################
+# Format 1996 English Broadcast News Eval (HUB4)
+###############################################################################
+mkdir -p data/eval96 data/eval96.pem
+
+# Each directory gets its own segmentation and utt2spk.
+cp $srcdir/hub4_96_dev_eval/eval96_pem_segments data/eval96.pem/segments
+cp $srcdir/hub4_96_dev_eval/eval96_pem_utt2spk data/eval96.pem/utt2spk
+cp $srcdir/hub4_96_dev_eval/eval96_uem_segments data/eval96/segments
+cp $srcdir/hub4_96_dev_eval/eval96_uem_utt2spk data/eval96/utt2spk
+
+# wav.scp, stm and glm are the same files in both directories.
+for x in eval96 eval96.pem; do
+  cp $srcdir/hub4_96_dev_eval/eval96_wav_scp data/$x/wav.scp
+  cp $srcdir/hub4_96_dev_eval/eval96_stm data/$x/stm
+  cp $srcdir/hub4_96_dev_eval/glm data/$x/glm
+done
+
+###############################################################################
+# Format 1997-98 HUB4 Broadcast News evaluation
+###############################################################################
+for t in eval97 eval98; do
+  uem=data/$t         # receives segments and utt2spk
+  pem=data/${t}.pem   # receives segments.pem and utt2spk.pem
+  mkdir -p $uem $pem
+  cp $srcdir/$t/segments $uem/segments
+  cp $srcdir/$t/utt2spk $uem/utt2spk
+  cp $srcdir/$t/segments.pem $pem/segments
+  cp $srcdir/$t/utt2spk.pem $pem/utt2spk
+  for f in wav.scp stm glm; do  # identical copies go to both directories
+    cp $srcdir/$t/$f $uem/$f
+    cp $srcdir/$t/$f $pem/$f
+  done
+done
+
+###############################################################################
+# Format 1999 HUB4 Broadcast News evaluation
+###############################################################################
+for d in eval99_1 eval99_2; do
+  src=$srcdir/eval99/${d}   # common prefix of this set's source files
+  mkdir -p data/${d} data/${d}.pem
+  cp ${src}_uem_segments data/${d}/segments
+  cp ${src}_uem_utt2spk data/${d}/utt2spk
+  cp ${src}_pem_segments data/${d}.pem/segments
+  cp ${src}_pem_utt2spk data/${d}.pem/utt2spk
+  # wav_scp is stored as wav.scp; stm and glm keep their names.
+  for f in wav_scp stm glm; do
+    cp ${src}_$f data/${d}/${f/_/.}
+    cp ${src}_$f data/${d}.pem/${f/_/.}
+  done
+done
+
+for d in train_bn96 train_bn97 eval96 eval96.pem dev96pe dev96ue eval97 eval97.pem \
+         eval98 eval98.pem eval99_1 eval99_1.pem eval99_2 eval99_2.pem; do
+  dir=data/$d
+  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
+  awk '{print $1, $1, 1}' $dir/wav.scp > $dir/reco2file_and_channel  # "<id> <id> 1" per wav.scp line
+  utils/fix_data_dir.sh $dir
+done
+
+utils/combine_data.sh data/train data/train_bn96 data/train_bn97
diff --git a/egs/hub4_english/s5/local/format_lms.sh b/egs/hub4_english/s5/local/format_lms.sh
new file mode 100755
index 00000000000..1d18209aa60
--- /dev/null
+++ b/egs/hub4_english/s5/local/format_lms.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+#
+# Copyright  2014 Nickolay V. Shmyrev
+# Apache 2.0
+
+[ -f ./path.sh ] && . ./path.sh
+
+set -e -o pipefail -u
+
+lang_suffix=_test  # appended to data/lang_nosp to name the output lang directory
+local_lm_dir=data/local/local_lm  # the ARPA LMs are read from $local_lm_dir/data/arpa
+
+. utils/parse_options.sh
+
+#arpa_lm=$local_lm_dir/data/arpa/4gram.arpa.gz
+small_arpa_lm=$local_lm_dir/data/arpa/4gram_small.arpa.gz  # compiled into G.fst below
+big_arpa_lm=$local_lm_dir/data/arpa/4gram_big.arpa.gz  # given to build_const_arpa_lm.sh below
+
+for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+set -e  # NOTE(review): -e was already enabled by the 'set -e -o pipefail -u' above
+
+cp -rT data/lang_nosp/ data/lang_nosp${lang_suffix}  # -T: treat the destination as the directory itself
+
+if [ -f data/lang_nosp${lang_suffix}/G.fst ] && [ data/lang_nosp${lang_suffix}/G.fst -nt $small_arpa_lm ]; then
+  echo "$0: not regenerating data/lang_nosp${lang_suffix}/G.fst as it already exists and "
+  echo ".. is newer than the source LM."
+else
+  arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_nosp/words.txt \
+    "gunzip -c $small_arpa_lm|" data/lang_nosp${lang_suffix}/G.fst
+  echo  "$0: Checking how stochastic G is (the first of these numbers should be small):"
+  fstisstochastic data/lang_nosp${lang_suffix}/G.fst || true  # informational: its exit status is ignored
+  utils/validate_lang.pl --skip-determinization-check data/lang_nosp${lang_suffix}
+fi
+
+# Rebuild the const-arpa LM unless G.carpa is newer than the big LM and words.txt.
+if [ -f data/lang_nosp${lang_suffix}_rescore/G.carpa ] && [ data/lang_nosp${lang_suffix}_rescore/G.carpa -nt $big_arpa_lm ] && \
+    [ data/lang_nosp${lang_suffix}_rescore/G.carpa -nt data/lang_nosp/words.txt ]; then
+  echo "$0: not regenerating data/lang_nosp${lang_suffix}_rescore/ as it seems to already be up to date."
+else
+  utils/build_const_arpa_lm.sh $big_arpa_lm data/lang_nosp \
+    data/lang_nosp${lang_suffix}_rescore || exit 1;
+fi
+
+exit 0;
diff --git a/egs/hub4_english/s5/local/lm/merge_word_counts.py b/egs/hub4_english/s5/local/lm/merge_word_counts.py
new file mode 100755
index 00000000000..6338cbbf875
--- /dev/null
+++ b/egs/hub4_english/s5/local/lm/merge_word_counts.py
@@ -0,0 +1,30 @@
+#! /usr/bin/env python
+
+# Copyright 2016    Vimal Manohar
+# Apache 2.0.
+
+"""This script merges pocolm word_counts read from stdin and writes the merged
+word_counts to stdout.  The required min-count argument restricts the output
+to words whose merged count is at least the specified minimum count.
+"""
+
+import sys
+
+
+def main():
+    """Sum "<count> <word>" lines from stdin; print totals >= <min-count>."""
+    if len(sys.argv) != 2:
+        sys.stderr.write("Usage: {0} <min-count>\n".format(sys.argv[0]))
+        raise SystemExit(1)
+    min_count = int(sys.argv[1])  # parsed once; a bad value fails up front
+    words = {}
+    for parts in (line.split() for line in sys.stdin):  # stream the input
+        if parts:  # skip blank lines instead of raising IndexError
+            words[parts[1]] = words.get(parts[1], 0) + int(parts[0])
+    for word, count in words.items():  # iteritems() does not exist on Python 3
+        if count >= min_count:
+            print("{0} {1}".format(count, word))
+
+
+if __name__ == '__main__':  # run only when executed as a script, not on import
+    main()
diff --git a/egs/hub4_english/s5/local/normalize_transcripts.pl b/egs/hub4_english/s5/local/normalize_transcripts.pl
new file mode 120000
index 00000000000..5f1261ccd79
--- /dev/null
+++ b/egs/hub4_english/s5/local/normalize_transcripts.pl
@@ -0,0 +1 @@
+data_prep/normalize_bn96_transcripts.pl
\ No newline at end of file
diff --git a/egs/hub4_english/s5/local/prepare_dict.sh b/egs/hub4_english/s5/local/prepare_dict.sh
new file mode 100755
index 00000000000..3f53ec6af74
--- /dev/null
+++ b/egs/hub4_english/s5/local/prepare_dict.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+
+# Copyright 2010-2012 Microsoft Corporation
+#           2012-2014 Johns Hopkins University (Author: Daniel Povey)
+#                2015 Guoguo Chen
+#                2016 Vimal Manohar
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# Call this script from one level above, e.g. from the s5/ directory.  It puts
+# its output in data/local/dict<dict_suffix>/.
+
+# The parts of the output of this that will be needed are
+# [in data/local/dict/ ]
+# lexicon.txt
+# extra_questions.txt
+# nonsilence_phones.txt
+# optional_silence.txt
+# silence_phones.txt
+
+[ -f ./path.sh ] && . ./path.sh
+. ./cmd.sh
+
+set -e
+set -o pipefail
+set -u
+
+# Run this from the recipe directory (the parent of local/).
+dict_suffix=  # appended to data/local/dict to form the output directory
+stage=-1  # a block guarded by "$stage -le N" is skipped when stage > N
+
+echo "$0 $@"  # Print the command line for logging
+. utils/parse_options.sh || exit 1;
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 <wordlist>"
+  echo "e.g. : $0 data/local/local_lm/data/work/wordlist"
+  exit 1
+fi
+
+wordlist=$1
+
+dir=data/local/dict${dict_suffix}
+mkdir -p $dir
+
+if [ ! -d $dir/cmudict ]; then
+  # (1) Get the CMU dictionary (skipped when $dir/cmudict already exists)
+  svn co  https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
+    $dir/cmudict || exit 1;
+fi
+
+cp $wordlist $dir/orig_wordlist
+
+# NOTE: the 'svn co' above can take -r 10966 for strict compatibility.
+
+#(2) Dictionary preparation:
+
+
+if [ $stage -le 0 ]; then
+  # Write the phone lists (silence and verbal/non-verbal noise phones are added at this point).
+  # NOTE(review): no _B, _E, _S position suffixes are added anywhere in this stage.
+
+  # silence phones, one per line.
+  (echo SIL; echo SPN; echo NSN; echo UNK;) > $dir/silence_phones.txt
+  echo SIL > $dir/optional_silence.txt
+
+  # nonsilence phones; each line lists the phones that share one base phone,
+  # i.e. the same name once the trailing digits are removed.
+  cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
+    perl -e 'while(<>){
+  chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
+  $phones_of{$1} .= "$_ "; }
+  foreach $list (values %phones_of) {print $list . "\n"; } ' \
+    > $dir/nonsilence_phones.txt || exit 1;
+
+  # A few extra questions that will be added to those obtained by automatically clustering
+  # the "real" phones: one line with all silence phones, then one line per trailing-digit (stress) value.
+  cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
+  cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
+    >> $dir/extra_questions.txt || exit 1;
+  # dict.cmu: cmudict without the ';;;' lines and with the "(N)" markers stripped from the words.
+  grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
+    perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
+    > $dir/dict.cmu || exit 1;
+
+  # Add to cmudict the silences, noises etc.
+
+  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<unk> UNK'; echo '<NOISE> NSN'; ) | \
+    cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
+  awk '{print $1}' $dir/lexicon2_raw.txt > $dir/wordlist_with_prons
+  # Rewrites silence_phones.txt with the same four phones written at the top of this stage.
+  cat <<EOF >$dir/silence_phones.txt
+SIL
+SPN
+NSN
+UNK
+EOF
+
+fi
+
+
+# Train the G2P model on dict.cmu once; exp/g2p/.done marks a finished run.
+if [ $stage -le 2 ] && [ ! -f exp/g2p/.done ]; then
+  steps/dict/train_g2p.sh --cmd "$train_cmd" \
+    --silence-phones $dir/silence_phones.txt \
+    $dir/dict.cmu exp/g2p
+  # Reached only when the training command above succeeded.
+  touch exp/g2p/.done
+fi
+
+export PATH=$PATH:`pwd`/local/dict  # make the scripts in local/dict callable by name
+
+if [ $stage -le 3 ]; then
+  utils/filter_scp.pl --exclude $dir/wordlist_with_prons < $dir/orig_wordlist | \
+    sort -u > $dir/oovlist  # presumably the input words absent from wordlist_with_prons; confirm against filter_scp.pl
+fi
+
+if [ $stage -le 7 ]; then
+  steps/dict/apply_g2p.sh --cmd "$train_cmd" \
+    $dir/oovlist exp/g2p exp/g2p/oov_lex
+  # Keep tab-separated fields 1 and 3; drop lines left with fewer than two words.
+  cut -f 1,3 exp/g2p/oov_lex/lexicon.lex | awk 'NF > 1' > $dir/dict.oovs_g2p
+fi
+
+if [ $stage -le 8 ]; then
+  # Merge the cmudict-based lexicon with the G2P entries; sort | uniq drops exact duplicate lines.
+  cat $dir/lexicon2_raw.txt $dir/dict.oovs_g2p | sort | uniq > \
+    $dir/lexicon.txt || exit 1;
+  # lexicon.txt is without the _B, _E, _S, _I markers.
+
+  rm $dir/lexiconp.txt 2>/dev/null || true  # remove any lexiconp.txt in $dir; a failing rm is ignored
+fi
+
+echo "Dictionary preparation succeeded"
diff --git a/egs/hub4_english/s5/local/run_cleanup_segmentation.sh b/egs/hub4_english/s5/local/run_cleanup_segmentation.sh
new file mode 100755
index 00000000000..e91ec318650
--- /dev/null
+++ b/egs/hub4_english/s5/local/run_cleanup_segmentation.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+# Copyright 2016  Vimal Manohar
+#           2016  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+# This script demonstrates how to re-segment training data selecting only the
+# "good" audio that matches the transcripts.
+# The basic idea is to decode with an existing in-domain acoustic model, and a
+# biased language model built from the reference, and then work out the
+# segmentation from a ctm like file.
+
+stage=0  # a block guarded by "$stage -le N" is skipped when stage > N
+cleanup_stage=0  # passed as --stage to clean_and_segment_data.sh
+data=data/train  # data directory handed to the cleanup script
+cleanup_affix=cleaned  # suffix used in the names of the output directories
+srcdir=exp/tri3  # model directory used for the cleanup and the first alignment
+nj=100  # --nj for the cleanup and alignment steps
+decode_nj=16  # job count used in the decoding stages
+decode_num_threads=4  # --num-threads for decode_fmllr.sh
+
+. ./path.sh
+. ./cmd.sh
+
+set -e
+set -o pipefail
+set -u
+
+. utils/parse_options.sh
+
+cleaned_data=${data}_${cleanup_affix}  # last argument of clean_and_segment_data.sh
+
+dir=${srcdir}_${cleanup_affix}_work  # 4th argument of clean_and_segment_data.sh
+cleaned_dir=${srcdir}_${cleanup_affix}  # output directory of the first train_sat.sh run
+
+if [ $stage -le 1 ]; then
+  # This does the actual data cleanup; $cleaned_data is the input of the stages below.
+  steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \
+    $data data/lang_nosp $srcdir $dir $cleaned_data
+fi
+
+if [ $stage -le 2 ]; then  # align $cleaned_data with the $srcdir model
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    $cleaned_data data/lang_nosp $srcdir ${srcdir}_ali_${cleanup_affix}
+fi
+
+if [ $stage -le 3 ]; then  # train_sat.sh on $cleaned_data from those alignments
+  steps/train_sat.sh --cmd "$train_cmd" \
+    4200 40000 $cleaned_data data/lang_nosp ${srcdir}_ali_${cleanup_affix} ${cleaned_dir}
+fi
+
+if [ $stage -le 4 ]; then
+  # Test with the model trained on cleaned-up data.
+  utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp
+  # Decode with one job per spk2utt line, capped at $decode_nj.
+  for dset in eval97.pem eval98.pem eval99_1.pem eval99_2.pem; do
+    this_nj=`cat data/$dset/spk2utt | wc -l`
+    if [ $this_nj -gt $decode_nj ]; then
+      this_nj=$decode_nj
+    fi
+    steps/decode_fmllr.sh --nj $this_nj --num-threads $decode_num_threads \
+       --cmd "$decode_cmd" \
+       ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset}
+    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \
+       data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore
+  done
+fi
+
+if [ $stage -le 5 ]; then  # re-align $cleaned_data with the model in ${cleaned_dir}
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    $cleaned_data data/lang_nosp ${cleaned_dir} ${cleaned_dir}_ali_${cleanup_affix}
+fi
+
+if [ $stage -le 6 ]; then  # larger train_sat.sh run (5000 100000), written to exp/tri4_${cleanup_affix}
+  steps/train_sat.sh --cmd "$train_cmd" \
+    5000 100000 $cleaned_data data/lang_nosp \
+    ${cleaned_dir}_ali_${cleanup_affix} exp/tri4_${cleanup_affix}
+fi
+
+cleaned_dir=exp/tri4_${cleanup_affix}
+if [ $stage -le 7 ]; then
+  # Test with the larger model trained on cleaned-up data.
+  utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp
+  # Decode with one job per spk2utt line, capped at $decode_nj.
+  for dset in eval97.pem eval98.pem eval99_1.pem eval99_2.pem; do
+    this_nj=`cat data/$dset/spk2utt | wc -l`
+    if [ $this_nj -gt $decode_nj ]; then
+      this_nj=$decode_nj
+    fi
+    steps/decode_fmllr.sh --nj $this_nj --num-threads $decode_num_threads \
+       --cmd "$decode_cmd"  \
+       ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset}
+    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \
+       data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore
+  done
+fi
diff --git a/egs/hub4_english/s5/local/run_segmentation_wsj.sh b/egs/hub4_english/s5/local/run_segmentation_wsj.sh
new file mode 100755
index 00000000000..a321abe9a29
--- /dev/null
+++ b/egs/hub4_english/s5/local/run_segmentation_wsj.sh
@@ -0,0 +1,313 @@
+#!/bin/bash
+
+# Copyright 2016-18  Vimal Manohar
+# Apache 2.0
+
+set -e
+set -o pipefail
+
+# This script demonstrates how to use out-of-domain WSJ models to segment long
+# audio recordings of HUB4 with raw unaligned transcripts into short segments
+# with aligned transcripts for training new ASR models. 
+
+# The overall procedure is as follows:
+# 1) Train a GMM on out-of-domain WSJ corpus
+# 2) Decode broadcast news recordings (HUB4) with WSJ GMM and 4-gram biased LM 
+#    trained on the raw unprocessed transcript. 
+# 3) Use the CTM output to segment the recordings, keeping the best-matched
+#    audio and text.
+# 4) Train an in-domain GMM on the above data. 
+# 5) Repeat steps 2, 3 and 4 using the new in-domain GMM.
+# 6) Re-segment the data retaining only the "clean" part of the data.
+
+# See the script steps/cleanup/segment_long_utterances.sh for details about 
+# audio-transcript alignment (Step 2, 3)
+# See the script steps/cleanup/clean_and_segment_data.sh for details about 
+# cleaning up transcripts (Step 6)
+
+# In step 3, if you need to align the full hypothesis of audio with the 
+# reference text as opposed to finding the best matching substring, 
+# then use --align-full-hyp true in the scripts below.
+
+# WSJ models (From step 1)
+# %WER 29.9 | 728 32834 | 72.9 17.8 9.3 2.8 29.9 92.7 | exp/wsj_tri3/decode_nosp_eval97.pem_rescore/score_16_0.0/eval97.pem.ctm.filt.sys
+# %WER 30.8 | 728 32834 | 71.8 18.4 9.8 2.6 30.8 92.3 | exp/wsj_tri3/decode_nosp_eval97.pem/score_17_0.0/eval97.pem.ctm.filt.sys
+
+# In-domain GMM (From step 4) -- 107 hrs
+# %WER 19.1 | 728 32834 | 82.7 12.2 5.1 1.9 19.1 86.4 | exp/tri4_a/decode_nosp_eval97.pem_rescore/score_14_1.0/eval97.pem.ctm.filt.sys
+# %WER 20.4 | 728 32834 | 81.6 13.1 5.3 2.1 20.4 87.4 | exp/tri4_a/decode_nosp_eval97.pem/score_14_0.0/eval97.pem.ctm.filt.sys
+
+# Stage 2 in-domain GMM (From step 5) -- 124 hrs
+# %WER 20.9 | 728 32834 | 81.2 13.6 5.3 2.1 20.9 87.4 | exp/tri4_2a/decode_nosp_eval97.pem/score_14_0.0/eval97.pem.ctm.filt.sys
+# %WER 19.8 | 728 32834 | 82.3 12.9 4.7 2.2 19.8 86.1 | exp/tri4_2a/decode_nosp_eval97.pem_rescore/score_12_0.5/eval97.pem.ctm.filt.sys
+
+# GMM trained on cleaned transcripts (From step 6) -- 120 hrs
+# %WER 18.4 | 728 32834 | 83.6 11.9 4.5 2.1 18.4 84.8 | exp/tri5_2a_cleaned/decode_nosp_eval97.pem_rescore/score_13_0.0/eval97.pem.ctm.filt.sys
+# %WER 19.6 | 728 32834 | 82.5 12.7 4.8 2.2 19.6 86.8 | exp/tri5_2a_cleaned/decode_nosp_eval97.pem/score_13_0.0/eval97.pem.ctm.filt.sys
+
+# Oracle HUB4 transcripts -- 148 hrs
+# %WER 17.8 | 728 32834 | 84.1 11.8 4.1 1.9 17.8 82.8 | exp/tri4/decode_nosp_eval97.pem_rescore/score_13_0.5/eval97.pem.ctm.filt.sys
+# %WER 19.0 | 728 32834 | 83.0 12.7 4.3 2.0 19.0 84.2 | exp/tri4/decode_nosp_eval97.pem/score_13_0.0/eval97.pem.ctm.filt.sys
+
+stage=0  # a block guarded by "$stage -le N" is skipped when stage > N
+segment_stage=-8  # passed as --stage to segment_long_utterances.sh
+nj=40  # --nj for the train_mono.sh and align_*.sh calls
+reco_nj=80  # --nj for make_mfcc.sh on train_long and for segment_long_utterances.sh
+stage1_affix=a    # For steps 2, 3 and 4 above
+stage2_affix=2a   # For step 5 above
+
+# WSJ run.sh must be run until the data preparation stage
+wsj_base=../../wsj/s5   # Change this to the WSJ base directory
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+. ./cmd.sh
+
+. utils/parse_options.sh
+
+if [ ! -f $wsj_base/data/train_si284/wav.scp ]; then
+  echo "WSJ data directory $wsj_base/data/train_si284 is not prepared."
+  echo "Run the initial stages of WSJ's run.sh"
+  exit 1  # a missing prerequisite is a failure; do not report success to the caller
+fi
+
+if [ $stage -le 0 ]; then
+  # Copy the three prepared WSJ training sets from $wsj_base into this recipe's data/.
+  utils/copy_data_dir.sh $wsj_base/data/train_si84_2kshort data/train_si84_2kshort
+  utils/copy_data_dir.sh $wsj_base/data/train_si84 data/train_si84
+  utils/copy_data_dir.sh $wsj_base/data/train_si284 data/train_si284
+fi
+
+###############################################################################
+## Simulate unsegmented HUB4 data directory.
+###############################################################################
+
+if [ $stage -le 1 ]; then
+  utils/data/convert_data_dir_to_whole.sh data/train data/train_long  # data/train_long stands in for the unsegmented data
+  # Compute MFCCs and CMVN stats for data/train_long.
+  steps/make_mfcc.sh --cmd "$train_cmd --max-jobs-run 40" \
+    --nj $reco_nj --write-utt2num-frames true \
+    data/train_long exp/make_mfcc/train_long mfcc
+  steps/compute_cmvn_stats.sh data/train_long \
+    exp/make_mfcc/train_long mfcc
+  utils/fix_data_dir.sh data/train_long
+fi
+
+###############################################################################
+## Train GMM on out-of-domain WSJ corpus 
+###############################################################################
+
+if [ $stage -le 2 ]; then  # train_mono.sh on train_si84_2kshort -> exp/wsj_mono0a
+  steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+    data/train_si84_2kshort data/lang_nosp exp/wsj_mono0a
+fi
+
+if [ $stage -le 3 ]; then  # align train_si84, then train_deltas.sh -> exp/wsj_tri1
+  steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+    data/train_si84 data/lang_nosp exp/wsj_mono0a exp/wsj_mono0a_ali_si84
+
+  steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2500 15000 \
+    data/train_si84 data/lang_nosp exp/wsj_mono0a_ali_si84 exp/wsj_tri1
+fi
+
+if [ $stage -le 4 ]; then  # align train_si284, then train_lda_mllt.sh -> exp/wsj_tri2
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+    data/train_si284 data/lang_nosp exp/wsj_tri1 exp/wsj_tri1_ali_si284
+
+  steps/train_lda_mllt.sh --cmd "$train_cmd" \
+    --splice-opts "--left-context=3 --right-context=3" 4000 42000 \
+    data/train_si284 data/lang_nosp exp/wsj_tri1_ali_si284 exp/wsj_tri2
+fi
+
+if [ $stage -le 5 ]; then  # align train_si284, then train_sat.sh -> exp/wsj_tri3
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+    data/train_si284 data/lang_nosp exp/wsj_tri2 exp/wsj_tri2_ali_si284
+
+  steps/train_sat.sh --cmd "$train_cmd" \
+    4000 42000 \
+    data/train_si284 data/lang_nosp exp/wsj_tri2_ali_si284 exp/wsj_tri3
+fi
+
+if [ $stage -le 6 ]; then
+  # Build the graph for wsj_tri3, decode eval97.pem, rescore with the const-arpa LM.
+  mdl=exp/wsj_tri3
+  utils/mkgraph.sh data/lang_nosp_test $mdl/ $mdl/graph_nosp_test
+  for dset in eval97.pem; do
+    this_nj=$(wc -l < data/$dset/spk2utt)
+    if [ $this_nj -gt 20 ]; then
+      this_nj=20   # never run more than 20 decoding jobs
+    fi
+    steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+      $mdl/graph_nosp_test data/$dset \
+      $mdl/decode_nosp_${dset}
+    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_nosp_test data/lang_nosp_test_rescore \
+      data/${dset} $mdl/decode_nosp_${dset} \
+      $mdl/decode_nosp_${dset}_rescore
+  done
+fi
+
+###############################################################################
+# Segment long HUB4 recordings and retrieve transcript using 
+# Smith-Waterman alignment.
+# Use a SAT model trained on train_si284 (wsj_tri3) as seed model for decoding.
+###############################################################################
+
+if [ $stage -le 7 ]; then  # segment data/train_long with the exp/wsj_tri3 model
+  steps/cleanup/segment_long_utterances.sh --cmd "$train_cmd" \
+    --stage $segment_stage --nj $reco_nj \
+    --max-bad-proportion 0.5 --align-full-hyp false \
+    exp/wsj_tri3 data/lang_nosp data/train_long \
+    data/train_reseg_${stage1_affix} exp/segment_long_utts_${stage1_affix}_train
+fi
+
+if [ $stage -le 8 ]; then
+  steps/compute_cmvn_stats.sh data/train_reseg_${stage1_affix} \
+    exp/make_mfcc/train_reseg_${stage1_affix} mfcc
+  utils/fix_data_dir.sh data/train_reseg_${stage1_affix}
+  # Derive the _spk30sec directory with modify_speaker_info.sh, then redo CMVN for it.
+  utils/data/modify_speaker_info.sh data/train_reseg_${stage1_affix} \
+    data/train_reseg_${stage1_affix}_spk30sec
+  steps/compute_cmvn_stats.sh data/train_reseg_${stage1_affix}_spk30sec \
+    exp/make_mfcc/train_reseg_${stage1_affix}_spk30sec mfcc
+  utils/fix_data_dir.sh data/train_reseg_${stage1_affix}_spk30sec
+fi
+
+###############################################################################
+# Train new in-domain GMM (tri4_a) on retrieved transcripts.
+###############################################################################
+
+if [ $stage -le 9 ]; then  # align with exp/wsj_tri3, then train_sat.sh -> exp/tri3_${stage1_affix}
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    data/train_reseg_${stage1_affix}_spk30sec data/lang_nosp \
+    exp/wsj_tri3 exp/wsj_tri3_ali_train_reseg_${stage1_affix}
+
+  steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
+    data/train_reseg_${stage1_affix}_spk30sec data/lang_nosp \
+    exp/wsj_tri3_ali_train_reseg_${stage1_affix} exp/tri3_${stage1_affix}  # 4200 40000 model
+fi
+
+if [ $stage -le 10 ]; then  # re-align with tri3_${stage1_affix}, then a larger train_sat.sh run -> tri4_${stage1_affix}
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    data/train_reseg_${stage1_affix}_spk30sec data/lang_nosp exp/tri3_${stage1_affix} exp/tri3_${stage1_affix}_ali
+
+  steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
+    data/train_reseg_${stage1_affix}_spk30sec data/lang_nosp exp/tri3_${stage1_affix}_ali exp/tri4_${stage1_affix}
+fi
+
+if [ $stage -le 11 ]; then
+  mdl=exp/tri4_${stage1_affix}
+  utils/mkgraph.sh data/lang_nosp_test $mdl/ $mdl/graph_nosp_test
+  for dset in eval97.pem; do
+    this_nj=$(wc -l < data/$dset/spk2utt)
+    if [ $this_nj -gt 20 ]; then
+      this_nj=20   # never run more than 20 decoding jobs
+    fi
+    steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+      $mdl/graph_nosp_test data/$dset $mdl/decode_nosp_${dset}
+    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_nosp_test data/lang_nosp_test_rescore \
+      data/${dset} $mdl/decode_nosp_${dset} $mdl/decode_nosp_${dset}_rescore
+  done
+fi
+
+###############################################################################
+# Segment long HUB4 recordings and retrieve transcript using 
+# Smith-Waterman alignment.
+# Use in-domain SAT model (tri4_a) as seed model for decoding.
+###############################################################################
+
+if [ $stage -le 12 ]; then  # same call as stage 7, but with the exp/tri4_${stage1_affix} model
+  steps/cleanup/segment_long_utterances.sh --cmd "$train_cmd" \
+    --stage $segment_stage --nj $reco_nj \
+    --max-bad-proportion 0.5 --align-full-hyp false \
+    exp/tri4_${stage1_affix} data/lang_nosp data/train_long \
+    data/train_reseg_${stage2_affix} exp/segment_long_utts_${stage2_affix}_train
+fi
+
+if [ $stage -le 13 ]; then  # CMVN stats for the new segmentation (no _spk30sec copy is made here)
+  steps/compute_cmvn_stats.sh data/train_reseg_${stage2_affix} \
+    exp/make_mfcc/train_reseg_${stage2_affix} mfcc
+  utils/fix_data_dir.sh data/train_reseg_${stage2_affix}
+fi
+
+###############################################################################
+# Train new in-domain GMM (tri4_2a) on retrieved transcripts.
+###############################################################################
+
+if [ $stage -le 14 ]; then  # align with tri4_${stage1_affix}, then train_sat.sh -> tri4_${stage2_affix}
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    data/train_reseg_${stage2_affix} data/lang_nosp \
+    exp/tri4_${stage1_affix} exp/tri4_${stage1_affix}_ali_train_reseg_${stage2_affix}
+
+  steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
+    data/train_reseg_${stage2_affix} data/lang_nosp \
+    exp/tri4_${stage1_affix}_ali_train_reseg_${stage2_affix} exp/tri4_${stage2_affix}  # 4200 40000 model
+fi
+
+if [ $stage -le 15 ]; then
+  mdl=exp/tri4_${stage2_affix}
+  utils/mkgraph.sh data/lang_nosp_test $mdl/ $mdl/graph_nosp_test
+  for dset in eval97.pem; do
+    this_nj=$(wc -l < data/$dset/spk2utt)
+    if [ $this_nj -gt 20 ]; then
+      this_nj=20   # never run more than 20 decoding jobs
+    fi
+    steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+      $mdl/graph_nosp_test data/$dset $mdl/decode_nosp_${dset}
+    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_nosp_test data/lang_nosp_test_rescore \
+      data/${dset} $mdl/decode_nosp_${dset} $mdl/decode_nosp_${dset}_rescore
+  done
+fi
+
+###############################################################################
+# Cleanup transcripts
+# Use in-domain SAT model (tri4_2a) as seed model for decoding.
+###############################################################################
+
+cleanup_stage=-1  # NOTE(review): assigned after parse_options.sh was sourced; presumably not settable from the command line (confirm)
+cleanup_affix=cleaned
+srcdir=exp/tri4_${stage2_affix}  # model used for the cleanup and for the alignment in stage 17
+cleaned_data=data/train_reseg_${stage2_affix}_${cleanup_affix}
+dir=${srcdir}_${cleanup_affix}_work  # 4th argument of clean_and_segment_data.sh
+cleaned_dir=${srcdir}_${cleanup_affix}  # NOTE(review): not referenced again in this script
+
+if [ $stage -le 16 ]; then
+  steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj 80 \
+    --cmd "$train_cmd" \
+    data/train_reseg_${stage2_affix} data/lang_nosp \
+    $srcdir $dir $cleaned_data
+fi
+
+###############################################################################
+# Train new in-domain GMM (tri5_2a_cleaned) on cleaned-up transcripts.
+###############################################################################
+
+if [ $stage -le 17 ]; then  # align $cleaned_data with $srcdir, then train_sat.sh -> tri5_${stage2_affix}_${cleanup_affix}
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    $cleaned_data data/lang_nosp $srcdir ${srcdir}_ali_${cleanup_affix}
+
+  steps/train_sat.sh --cmd "$train_cmd" \
+    5000 100000 $cleaned_data data/lang_nosp \
+    ${srcdir}_ali_${cleanup_affix} exp/tri5_${stage2_affix}_${cleanup_affix}
+fi
+
+if [ $stage -le 18 ]; then
+  # Evaluate the model trained on the cleaned-up data.
+  mdl=exp/tri5_${stage2_affix}_${cleanup_affix}
+  utils/mkgraph.sh data/lang_nosp_test $mdl/ $mdl/graph_nosp_test
+  for dset in eval97.pem; do
+    this_nj=$(wc -l < data/$dset/spk2utt)
+    if [ $this_nj -gt 20 ]; then
+      this_nj=20   # never run more than 20 decoding jobs
+    fi
+    steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+      $mdl/graph_nosp_test data/$dset $mdl/decode_nosp_${dset}
+    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_nosp_test data/lang_nosp_test_rescore \
+      data/${dset} $mdl/decode_nosp_${dset} \
+      $mdl/decode_nosp_${dset}_rescore
+  done
+fi
+
+exit 0
diff --git a/egs/hub4_english/s5/local/score.sh b/egs/hub4_english/s5/local/score.sh
new file mode 120000
index 00000000000..d89286dc25a
--- /dev/null
+++ b/egs/hub4_english/s5/local/score.sh
@@ -0,0 +1 @@
+score_sclite.sh
\ No newline at end of file
diff --git a/egs/hub4_english/s5/local/score_sclite.sh b/egs/hub4_english/s5/local/score_sclite.sh
new file mode 100755
index 00000000000..add014c2dcc
--- /dev/null
+++ b/egs/hub4_english/s5/local/score_sclite.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.
+
+# begin configuration section.
+cmd=run.pl
+stage=0
+min_lmwt=5
+max_lmwt=17
+iter=final
+word_ins_penalty=0.0,0.5,1.0
+resolve_ctm_overlaps=false
+#end configuration section.
+
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then  # exactly three positional arguments are required
+  echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2|3)               # start scoring script from part-way through."
+  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi
+
+data=$1
+lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
+dir=$3
+
+model=$dir/../$iter.mdl # assume model one level up from decoding dir.
+
+hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
+[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
+hubdir=`dirname $hubscr`
+
+for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
+     $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do
+  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
+done
+
+if [ -f $dir/../frame_shift ]; then
+  frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)"
+  echo "$0: $dir/../frame_shift exists, using $frame_shift_opt"
+elif [ -f $dir/../frame_subsampling_factor ]; then
+  factor=$(cat $dir/../frame_subsampling_factor) || exit 1
+  frame_shift_opt="--frame-shift=0.0$factor"
+  echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt"
+fi
+
+name=`basename $data`; # e.g. eval2000
+
+mkdir -p $dir/scoring/log
+
+if [ $stage -le 0 ]; then
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \
+      mkdir -p $dir/score_LMWT_${wip}/ '&&' \
+      lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+      lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+      lattice-1best ark:- ark:- \| \
+      lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
+      nbest-to-ctm $frame_shift_opt ark:- - \| \
+      utils/int2sym.pl -f 5 $lang/words.txt '>' \
+      $dir/score_LMWT_${wip}/$name.utt_ctm || exit 1;
+  done
+fi
+
+if [ $stage -le 1 ]; then
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    if $resolve_ctm_overlaps; then
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/resolve_ctm_overlaps.LMWT.${wip}.log \
+        utils/ctm/resolve_ctm_overlaps.py $data/segments \
+          $dir/score_LMWT_${wip}/$name.utt_ctm - \| \
+        utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
+        '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
+    else
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/convert_ctm.LMWT.${wip}.log \
+        cat $dir/score_LMWT_${wip}/$name.utt_ctm \| \
+        utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
+        '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
+    fi
+  done
+fi
+
+if [ $stage -le 2 ]; then
+  # Remove some stuff we don't want to score, from the ctm (filtered in place).
+  # The first grep drops the <NOISE> and <SPOKEN_NOISE> tokens; the alternation is
+  # grouped so that the angle brackets apply to both names.
+  # The big expression in parentheses contains all the things that get mapped
+  # by the glm file, into hesitations.  The -$ expression removes partial words.
+  # The aim here is to remove all the things that appear in the reference as optionally
+  # deletable (inside parentheses): deleting them loses nothing, getting them right gains nothing.
+  for x in $dir/score_*/$name.ctm; do
+    cp $x $dir/tmpf;
+    cat $dir/tmpf | grep -i -v -E '<(NOISE|SPOKEN_NOISE)>' | \
+    grep -i -v -E ' (UH|UM|EH|MM|HM|AH|HUH|HA|ER|OOF|HEE|ACH|EEE|EW)$' | \
+    grep -v -- '-$' > $x;
+  done
+fi
+
+# Score the set...
+if [ $stage -le 3 ]; then
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \
+      cp $data/stm $dir/score_LMWT_${wip}/ '&&' \
+      $hubscr -p $hubdir -V -l english -h hub4 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm || exit 1;
+  done
+fi
+
+exit 0
diff --git a/egs/hub4_english/s5/local/train_lm.sh b/egs/hub4_english/s5/local/train_lm.sh
new file mode 100755
index 00000000000..4378a287d42
--- /dev/null
+++ b/egs/hub4_english/s5/local/train_lm.sh
@@ -0,0 +1,234 @@
+#!/bin/bash
+
+# Copyright 2016  Vimal Manohar
+# Apache 2.0
+#
+# This script trains a LM on the Broadcast News transcripts.
+# It is based on the example scripts distributed with PocoLM.
+
+# It first checks that pocolm is installed; if it is not, it prints installation instructions and exits.
+
+
+set -e
+set -o pipefail 
+set -u
+
+stage=0
+dir=data/local/local_lm
+cmd=run.pl
+vocab_size=   # Preferred vocabulary size
+
+echo "$0 $@"  # Print the command line for logging
+. utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+num_dev_sentences=4500
+RANDOM=0  # seeds bash's $RANDOM only; shuf ignores it, so the shuffle below takes its own seed
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  # Split unique sentences (shuffled with a fixed seed) so that no training text is in
+  # the dev set. awk strips the left-padded "uniq -c" count, then repeats each sentence.
+  # Replace train with train_bn96 in order to use only the 1996 HUB4 set
+  cat data/train/text | cut -d ' ' -f 2- | sort | uniq -c | \
+    utils/shuffle_list.pl --srand 0 > ${dir}/train_text_with_count
+  head -n $num_dev_sentences < ${dir}/train_text_with_count | \
+    awk '{n=$1; sub(/^ *[0-9]+ /, ""); for (i=0; i<n; i++) {print $0;} }' > \
+    ${dir}/data/text/dev.txt
+  tail -n +$((num_dev_sentences+1)) < ${dir}/train_text_with_count | \
+    awk '{n=$1; sub(/^ *[0-9]+ /, ""); for (i=0; i<n; i++) {print $0;} }' > \
+    ${dir}/data/text/train.txt
+
+  # Get text from NA News corpus
+  for x in data/local/data/na_news/*; do
+    y=`basename $x`
+    [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz
+  done
+
+  # Get text from 1996 CSR HUB4 LM corpus
+  for x in `cat data/local/data/csr96_hub4/{train,test}.filelist`; do
+    gunzip -c $x
+  done | gzip -c > ${dir}/data/text/csr96_hub4.txt.gz
+
+  # Get text from 1995 CSR-IV HUB4 corpus
+  cat data/local/data/csr95_hub4/dev95_text \
+    data/local/data/csr95_hub4/eval95_text \
+    data/local/data/csr95_hub4/train95_text | cut -d ' ' -f 2- > \
+    ${dir}/data/text/csr95_hub4.txt
+
+  # Get text from NA News supplement corpus
+  for x in data/local/data/na_news_supp; do
+    y=`basename $x`
+    [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz
+  done
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  for x in dev96pe dev96ue eval96 eval97 eval98 eval99_1 eval99_2; do
+    cat data/$x/stm | awk '!/^;;/ {if (NF > 6) print $0}' | cut -d ' ' -f 1,7- | \
+      awk '!/IGNORE_TIME_SEGMENT_IN_SCORING/ {print $0}' | \
+      local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" | \
+      cut -d ' ' -f 2- > ${dir}/data/${x}.txt
+  done
+fi
+
+if [ $stage -le 1 ]; then
+  mkdir -p $dir/data/work
+  if [ ! -f $dir/data/work/word_counts/.done ]; then
+    get_word_counts.py $dir/data/text $dir/data/work/word_counts
+    touch $dir/data/work/word_counts/.done
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  # decide on the vocabulary.
+
+  # NA news corpus is not clean. So better not to get vocabulary from there.
+  # for x in data/local/data/na_news/*; do
+  #   y=$dir/data/work/word_counts/`basename $x`.counts
+  #   [ -f $y ] && cat $y 
+  # done | local/lm/merge_word_counts.py 15 > $dir/data/work/na_news.wordlist_counts
+
+  cat $dir/data/work/word_counts/{train,dev}.counts | \
+    local/lm/merge_word_counts.py 2 > $dir/data/work/train.wordlist_counts
+
+  cat $dir/data/work/word_counts/csr96_hub4.counts | \
+    local/lm/merge_word_counts.py 5 > $dir/data/work/csr96_hub4.wordlist_counts
+
+  cat $dir/data/work/word_counts/csr95_hub4.counts | \
+    local/lm/merge_word_counts.py 5 > $dir/data/work/csr95_hub4.wordlist_counts
+
+  cat $dir/data/work/{train,csr96_hub4,csr95_hub4}.wordlist_counts | \
+    perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[0] $F[1]\n"; }' | \
+    local/lm/merge_word_counts.py 1 | sort -k 1,1nr > $dir/data/work/final.wordlist_counts
+
+  if [ ! -z "$vocab_size" ]; then
+    awk -v sz=$vocab_size 'BEGIN{count=-1;} 
+    { i+=1; if (i == int(sz)) { count = $1; };
+      if (count > 0 && count != $1) { exit(0); } 
+      print $0;
+    }' $dir/data/work/final.wordlist_counts
+  else 
+    cat $dir/data/work/final.wordlist_counts
+  fi | awk '{print $2}' > $dir/data/work/wordlist
+fi
+
+order=4
+wordlist=$dir/data/work/wordlist
+
+min_counts='default=5 train=1 csr96_hub4=2,3 csr95_hub4=2,3'
+
+lm_name="`basename ${wordlist}`_${order}"
+if [ -n "${min_counts}" ]; then
+  lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "," "." | tr "=" "-"`"
+fi
+unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+
+if [ $stage -le 3 ]; then
+  echo "$0: training the unpruned LM"
+
+  $cmd ${unpruned_lm_dir}/log/train.log \
+    train_lm.py  --wordlist=$wordlist --num-splits=10 --warm-start-ratio=20  \
+                 --limit-unk-history=true \
+                 --fold-dev-into=train \
+                 --min-counts="${min_counts}" \
+                 ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do
+    $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}.log \
+      get_data_prob.py ${dir}/data/${x}.txt ${unpruned_lm_dir} 
+
+    cat ${unpruned_lm_dir}/log/compute_data_prob_${x}.log | grep -F '[perplexity'
+  done
+  
+  # train_lm.py: You can set --bypass-metaparameter-optimization='0.829,0.997,0.066,0.014,0.171,0.244,0.063,0.001,0.023,0.004,0.014,0.006,0.018,0.027,0.082,1.000,0.004,0.007,0.024,0.703,0.108,0.046,0.019,0.848,0.258,0.208,0.195,0.889,0.297,0.282,0.242' to get equivalent results
+  # train_lm.py: Ngram counts: 98768 + 26286404 + 21077207 + 17945418 = 65407797
+  
+  # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.88365261291 per word [perplexity = 132.112338899] over 18771.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.9299451353 per word [perplexity = 138.371920398] over 23710.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.8308081807 per word [perplexity = 125.312194639] over 20553.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.82377287988 per word [perplexity = 124.433679586] over 33234.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.88114977878 per word [perplexity = 131.782097071] over 33180.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -5.01175279868 per word [perplexity = 150.167719384] over 11529.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/wordlist_4_default-5_train-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -5.01485733132 per word [perplexity = 150.634644387] over 16395.0 words.
+  
+fi
+
+if [ $stage -le 4 ]; then
+  echo "$0: pruning the LM (to larger size)"
+  # Using 10 million n-grams for a big LM for rescoring purposes.
+  size=10000000
+  $cmd ${dir}/data/lm_${order}_prune_big/log/prune_lm.log \
+    prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 \
+    ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
+
+  for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do
+    $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log \
+      get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_big
+
+    cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log | grep -F '[perplexity'
+  done
+
+  # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/lm_4_prune_big was -4.96695051249 per word [perplexity = 143.588348177] over 18771.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/lm_4_prune_big was -5.01232680304 per word [perplexity = 150.253941052] over 23710.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/lm_4_prune_big was -4.91227395027 per word [perplexity = 135.948202644] over 20553.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/lm_4_prune_big was -4.92411302883 per word [perplexity = 137.567269311] over 33234.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/lm_4_prune_big was -4.97443821579 per word [perplexity = 144.667530381] over 33180.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/lm_4_prune_big was -5.10483206523 per word [perplexity = 164.816389804] over 11529.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/lm_4_prune_big was -5.10905926136 per word [perplexity = 165.514575655] over 16395.0 words.
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
+fi
+
+if [ $stage -le 5 ]; then
+  echo "$0: pruning the LM (to smaller size)"
+  # Using 2 million n-grams for a smaller LM for graph building.  Prune from the
+  # bigger-pruned LM, it'll be faster.
+  size=2000000
+  
+  $cmd ${dir}/data/lm_${order}_prune_small/log/prune_lm.log \
+    prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big \
+    ${dir}/data/lm_${order}_prune_small
+
+  for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do
+    $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log \
+      get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_small
+
+    cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log | grep -F '[perplexity'
+  done
+
+  # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/lm_4_prune_small was -5.12459372596 per word [perplexity = 168.105830741] over 18771.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/lm_4_prune_small was -5.16866547448 per word [perplexity = 175.680231224] over 23710.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/lm_4_prune_small was -5.08096906048 per word [perplexity = 160.929931226] over 20553.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/lm_4_prune_small was -5.09222677679 per word [perplexity = 162.751870937] over 33234.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/lm_4_prune_small was -5.12842796263 per word [perplexity = 168.751625556] over 33180.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/lm_4_prune_small was -5.26755997571 per word [perplexity = 193.942161054] over 11529.0 words.
+  # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/lm_4_prune_small was -5.27092234584 per word [perplexity = 194.595363921] over 16395.0 words
+
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
+fi
diff --git a/egs/hub4_english/s5/path.sh b/egs/hub4_english/s5/path.sh
new file mode 100755
index 00000000000..49813fc4cd0
--- /dev/null
+++ b/egs/hub4_english/s5/path.sh
@@ -0,0 +1,7 @@
+export KALDI_ROOT=../../..
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+if [ -f $KALDI_ROOT/tools/env.sh ]; then . $KALDI_ROOT/tools/env.sh; fi
+export PATH=$KALDI_ROOT/tools/sctk/bin:$PATH
+export LC_ALL=C
diff --git a/egs/hub4_english/s5/run.sh b/egs/hub4_english/s5/run.sh
new file mode 100755
index 00000000000..5db61d4eb10
--- /dev/null
+++ b/egs/hub4_english/s5/run.sh
@@ -0,0 +1,269 @@
+#!/bin/bash
+
+# Copyright 2016   Vimal Manohar
+# Apache 2.0.
+
+# See README for more info on data required.
+
+. ./cmd.sh
+. ./path.sh
+
+set -o pipefail
+set -e
+
+mfccdir=`pwd`/mfcc
+nj=40
+stage=-1
+
+. utils/parse_options.sh
+
+# Training corpora
+
+# 1996 English Broadcast News Train (HUB4)
+hub4_96_train_transcripts=/export/corpora/LDC/LDC97T22/hub4_eng_train_trans
+hub4_96_train_speech=/export/corpora/LDC/LDC97S44/data
+# 1997 English Broadcast News Train (HUB4)
+hub4_97_train_transcripts=/export/corpora/LDC/LDC98T28/hub4e97_trans_980217
+hub4_97_train_speech=/export/corpora/LDC/LDC98S71/97_eng_bns_hub4
+# 1996 CSR HUB4 Language Model
+csr_hub4_lm=/export/corpora/LDC/LDC98T31/1996_csr_hub4_model
+# 1995 CSR-IV HUB4 corpus
+csr95_hub4=/export/corpora/LDC/LDC96S31/csr95_hub4
+# North American News Text Corpus
+NA_text=/export/corpora/LDC/LDC95T21
+# North American News Text Supplement Corpus
+NA_text_supp=/export/corpora/LDC/LDC98T30/northam_news_txt_sup
+
+# Test corpora
+
+# 1996 English Broadcast News Dev and Eval (HUB4)
+hub4_96_eval=/export/corpora/LDC/LDC97S66/1996_eng_bcast_dev_eval
+# 1997 HUB4 English Evaluation corpus
+hub4_97_eval=/export/corpora/LDC/LDC2002S11/hub4e_97
+# 1998 HUB4 Broadcast News Evaluation English Test Material
+hub4_98_eval=/export/corpora/LDC/LDC2000S86
+# 1999 HUB4 Broadcast News Evaluation English Test Material
+hub4_99_eval=/export/corpora5/LDC/LDC2000S88/hub4_1999
+
+# Test sets used -- Uncomment and keep only test sets needed
+test_sets="eval97.pem"
+# test_sets="dev96ue dev96pe eval96 eval96.pem eval97 eval97.pem eval98 eval98.pem eval99_1 eval99_1.pem eval99_2 eval99_2.pem"
+
+if [ $stage -le 0 ]; then
+  # Prepare 1996 English Broadcast News Train (HUB4)
+  local/data_prep/prepare_1996_bn_data.sh \
+    $hub4_96_train_transcripts \
+    $hub4_96_train_speech \
+    data/local/data/train_bn96
+
+  # Prepare 1997 English Broadcast News Train (HUB4)
+  local/data_prep/prepare_1997_bn_data.sh \
+    $hub4_97_train_transcripts \
+    $hub4_97_train_speech \
+    data/local/data/train_bn97
+fi
+
+# Install Beautiful Soup 4 python package for parsing SGML-like files
+# in CSR-IV HUB4 corpus
+if [ ! -d tools/beautifulsoup4 ]; then
+  mkdir -p tools
+  pip install -t tools/beautifulsoup4 beautifulsoup4
+fi
+export PYTHONPATH=$PWD/tools/beautifulsoup4:$PYTHONPATH
+
+if [ $stage -le 1 ]; then
+  if [ ! -f $csr_hub4_lm/utils.tar ]; then
+    echo "Expected CSR-IV utils.tar to be found"
+    exit 1
+  fi
+
+  mkdir -p tools/csr4_utils
+  (
+    cd tools/csr4_utils
+    tar -xvf $csr_hub4_lm/utils.tar
+  )
+
+  chmod a+w tools/csr4_utils
+  patch -u -d tools/csr4_utils -p3 < local/data_prep/csr4_utils.patch
+fi
+
+if [ $stage -le 2 ]; then
+  # Prepare 1995 CSR-IV HUB4 corpus
+  local/data_prep/prepare_1995_csr_hub4_corpus.sh \
+    $csr95_hub4 data/local/data/csr95_hub4
+fi
+
+if [ $stage -le 3 ]; then
+  # Prepare North American News Text Corpus
+  local/data_prep/prepare_na_news_text_corpus.sh --nj 40 --cmd "$train_cmd" \
+     $NA_text data/local/data/na_news
+
+  # Prepare North American News Text Supplement Corpus
+  local/data_prep/prepare_na_news_text_supplement.sh --nj 10 --cmd "$train_cmd" \
+    $NA_text_supp data/local/data/na_news_supp
+fi
+
+if [ $stage -le 4 ]; then
+  # Prepare 1996 CSR HUB4 Language Model
+  local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh --nj 10 --cmd "$train_cmd" \
+     $csr_hub4_lm data/local/data/csr96_hub4
+fi
+
+if [ $stage -le 5 ]; then
+  # Prepare 1996 English Broadcast News Dev and Eval (HUB4)
+  local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh \
+    $hub4_96_eval \
+    data/local/data/hub4_96_dev_eval
+
+  # Prepare 1997 HUB4 English Evaluation corpus
+  local/data_prep/prepare_1997_hub4_bn_eng_eval.sh \
+    $hub4_97_eval data/local/data/eval97
+
+  # Prepare 1998 HUB4 Broadcast News Evaluation English Test Material
+  local/data_prep/prepare_1998_hub4_bn_eng_eval.sh \
+    $hub4_98_eval data/local/data/eval98
+
+  # Prepare 1999 HUB4 Broadcast News Evaluation English Test Material
+  local/data_prep/prepare_1999_hub4_bn_eng_eval.sh \
+    $hub4_99_eval data/local/data/eval99
+fi
+
+if [ $stage -le 6 ]; then
+  local/format_data.sh
+fi
+
+if [ $stage -le 7 ]; then
+  local/train_lm.sh
+fi
+
+if [ $stage -le 8 ]; then
+  local/prepare_dict.sh --dict-suffix "_nosp" \
+    data/local/local_lm/data/work/wordlist
+
+  utils/prepare_lang.sh data/local/dict_nosp \
+    "<unk>" data/local/lang_tmp_nosp data/lang_nosp
+fi
+
+if [ $stage -le 9 ]; then
+  local/format_lms.sh --local-lm-dir data/local/local_lm
+fi
+
+if [ $stage -le 10 ]; then
+  for x in train $test_sets; do
+    this_nj=$(cat data/$x/utt2spk | wc -l)
+    if [ $this_nj -gt 30 ]; then
+      this_nj=30
+    fi
+
+    steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj $this_nj \
+      --cmd "$train_cmd" \
+      data/$x exp/make_mfcc $mfccdir
+    steps/compute_cmvn_stats.sh data/$x exp/make_mfcc $mfccdir
+    utils/fix_data_dir.sh data/$x
+  done
+fi
+
+if [ $stage -le 15 ]; then
+  utils/subset_data_dir.sh --shortest data/train 1000 data/train_1kshort
+  utils/subset_data_dir.sh data/train 2000 data/train_2k
+
+  # Note: the --boost-silence option should probably be omitted by default
+  # for normal setups.  It doesn't always help. [it's to discourage non-silence
+  # models from modeling silence.]
+  steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+    data/train_1kshort data/lang_nosp exp/mono0a
+fi
+
+if [ $stage -le 16 ]; then
+  steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+    data/train_2k data/lang_nosp exp/mono0a exp/mono0a_ali
+
+  steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \
+    data/train_2k data/lang_nosp exp/mono0a_ali exp/tri1
+fi
+
+if [ $stage -le 17 ]; then
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+    data/train data/lang_nosp exp/tri1 exp/tri1_ali
+
+  steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 15000 \
+    data/train data/lang_nosp exp/tri1_ali exp/tri2
+fi
+
+if [ $stage -le 18 ]; then
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+    data/train data/lang_nosp exp/tri2 exp/tri2_ali
+
+  steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
+    data/train data/lang_nosp exp/tri2_ali exp/tri3
+fi
+
+if [ $stage -le 19 ]; then
+  utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp
+
+  for dset in $test_sets; do
+    (
+    this_nj=`cat data/$dset/spk2utt | wc -l`
+    if [ $this_nj -gt 20 ]; then
+      this_nj=20
+    fi
+    steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+      exp/tri3/graph_nosp data/$dset exp/tri3/decode_nosp_${dset} || touch exp/tri3/.error
+    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_nosp_test data/lang_nosp_test_rescore \
+      data/${dset} exp/tri3/decode_nosp_${dset} \
+      exp/tri3/decode_nosp_${dset}_rescore || touch exp/tri3/.error
+    ) &
+  done
+  wait
+
+  if [ -f exp/tri3/.error ]; then
+    echo "Decode failed in exp/tri3/decode*"
+    exit 1
+  fi
+fi
+
+if [ $stage -le 20 ]; then
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    data/train data/lang_nosp exp/tri3 exp/tri3_ali
+
+  steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
+    data/train data/lang_nosp exp/tri3_ali exp/tri4
+fi
+
+if [ $stage -le 21 ]; then
+  utils/mkgraph.sh data/lang_nosp_test exp/tri4 exp/tri4/graph_nosp
+
+  for dset in $test_sets; do
+    (  # one background subshell per test set; failures are flagged via exp/tri4/.error
+    this_nj=`cat data/$dset/spk2utt | wc -l`
+    if [ $this_nj -gt 20 ]; then
+      this_nj=20
+    fi
+    steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+      exp/tri4/graph_nosp data/$dset exp/tri4/decode_nosp_${dset} || touch exp/tri4/.error
+    steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+      data/lang_nosp_test data/lang_nosp_test_rescore \
+      data/${dset} exp/tri4/decode_nosp_${dset} \
+      exp/tri4/decode_nosp_${dset}_rescore || touch exp/tri4/.error
+    ) &
+  done
+  wait  # a bare wait always returns 0, hence the marker-file check below
+
+  if [ -f exp/tri4/.error ]; then
+    echo "Decode failed in exp/tri4/decode*"
+    exit 1
+  fi
+fi
+
+wait
+
+# %WER 18.0 | 728 32834 | 83.9 11.7 4.3 2.0 18.0 85.9 | exp/tri4/decode_nosp_eval97.pem_rescore/score_14_0.0/eval97.pem.ctm.filt.sys
+# %WER 19.3 | 728 32834 | 82.9 12.6 4.6 2.2 19.3 86.8 | exp/tri4/decode_nosp_eval97.pem/score_13_0.0/eval97.pem.ctm.filt.sys
+
+# The following demonstrates how to use out-of-domain WSJ models to segment long
+# audio recordings of HUB4 with raw unaligned transcripts into short segments
+# with aligned transcripts for training new ASR models.
+
+# local/run_segmentation_wsj.sh
+exit 0
diff --git a/egs/hub4_english/s5/steps b/egs/hub4_english/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/hub4_english/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/hub4_english/s5/utils b/egs/hub4_english/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/hub4_english/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
diff --git a/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py b/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py
index e940183497b..848ca61ebe4 100755
--- a/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py
+++ b/egs/wsj/s5/steps/cleanup/internal/align_ctm_ref.py
@@ -488,20 +488,24 @@ def ctm_line_to_string(ctm_line):
     return " ".join([str(x) for x in ctm_line])
 
 
-def test_alignment():
-    hyp = "ACACACTA"
+def test_alignment(align_full_hyp):
+    hyp = "GCCAT"
     ref = "AGCACACA"
 
+    verbose = 3
+    logger.info("REF: %s", ref)
+    logger.info("HYP: %s", hyp)
+
     output, score = smith_waterman_alignment(
         ref, hyp, similarity_score_function=lambda x, y: 2 if (x == y) else -1,
-        del_score=-1, ins_score=-1, eps_symbol="-", align_full_hyp=True)
+        del_score=-1, ins_score=-1, eps_symbol="-", align_full_hyp=align_full_hyp)
 
     print_alignment("Alignment", output, out_file_handle=sys.stderr)
 
 
 def run(args):
     if args.debug_only:
-        test_alignment()
+        test_alignment(args.align_full_hyp)
         raise SystemExit("Exiting since --debug-only was true")
 
     def similarity_score_function(x, y):
diff --git a/egs/wsj/s5/steps/cleanup/internal/get_ctm.sh b/egs/wsj/s5/steps/cleanup/internal/get_ctm.sh
deleted file mode 100755
index 35c0a4bd3a8..00000000000
--- a/egs/wsj/s5/steps/cleanup/internal/get_ctm.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.
-# Copyright 2017  Vimal Manohar
-
-# This script produces CTM files from a decoding directory that has lattices
-# present.
-# This is similar to get_ctm.sh, but gets the
-# CTM at the utterance-level.
-
-
-# begin configuration section.
-cmd=run.pl
-stage=0
-frame_shift=0.01
-lmwt=10
-print_silence=false
-#end configuration section.
-
-echo "$0 $@"  # Print the command line for logging
-
-[ -f ./path.sh ] && . ./path.sh
-. parse_options.sh || exit 1;
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 [options] <data-dir> <lang-dir|graph-dir> <decode-dir>"
-  echo " Options:"
-  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
-  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
-  echo "    --frame-shift (default=0.01)    # specify this if your lattices have a frame-shift"
-  echo "                                    # not equal to 0.01 seconds"
-  echo "e.g.:"
-  echo "$0 data/train data/lang exp/tri4a/decode/"
-  echo "See also: steps/get_train_ctm.sh"
-  exit 1;
-fi
-
-data=$1
-lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
-dir=$3
-
-if [ -f $dir/final.mdl ]; then
-  model=$dir/final.mdl
-else
-  model=$dir/../final.mdl # assume model one level up from decoding dir.
-fi
-
-for f in $lang/words.txt $model $dir/lat.1.gz; do
-  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
-done
-
-name=`basename $data`; # e.g. eval2000
-
-mkdir -p $dir/scoring/log
-
-if [ $stage -le 0 ]; then
-  nj=$(cat $dir/num_jobs)
-  if [ -f $lang/phones/word_boundary.int ]; then
-    $cmd JOB=1:$nj $dir/scoring/log/get_ctm.JOB.log \
-      set -o pipefail '&&' mkdir -p $dir/score_$lmwt/ '&&' \
-      lattice-1best --lm-scale=$lmwt "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
-      lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
-      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
-      utils/int2sym.pl -f 5 $lang/words.txt \
-      '>' $dir/score_$lmwt/${name}.ctm.JOB || exit 1;
-  elif [ -f $lang/phones/align_lexicon.int ]; then
-    $cmd JOB=1:$nj $dir/scoring/log/get_ctm.JOB.log \
-      set -o pipefail '&&' mkdir -p $dir/score_$lmwt/ '&&' \
-      lattice-1best --lm-scale=$lmwt "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
-      lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
-      lattice-1best ark:- ark:- \| \
-      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
-      utils/int2sym.pl -f 5 $lang/words.txt \
-      '>' $dir/score_${lmwt}/${name}.ctm.JOB || exit 1;
-  else
-    echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
-    exit 1;
-  fi
-fi
diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
index 7d83fbb29f8..16350fdb032 100755
--- a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
+++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
@@ -6,7 +6,7 @@
 
 . ./path.sh
 
-set -e 
+set -e
 set -o pipefail
 set -u
 
@@ -24,13 +24,13 @@ lmwt=10
 
 # TF-IDF similarity search options
 max_words=1000
-num_neighbors_to_search=1   # Number of neighboring documents to search around the one retrieved based on maximum tf-idf similarity. 
-neighbor_tfidf_threshold=0.5   
+num_neighbors_to_search=1   # Number of neighboring documents to search around the one retrieved based on maximum tf-idf similarity.
+neighbor_tfidf_threshold=0.5
 
 align_full_hyp=false  # Align full hypothesis i.e. trackback from the end to get the alignment.
 
 # First-pass segmentation opts
-# These options are passed to the script 
+# These options are passed to the script
 # steps/cleanup/internal/segment_ctm_edits_mild.py
 segmentation_extra_opts=
 min_split_point_duration=0.1
@@ -56,11 +56,11 @@ Usage: $0 [options] <model-dir> <lang> <data-in> [<text-in> <utt2text>] <segment
  e.g.: $0 exp/wsj_tri2b data/lang_nosp data/train_long data/train_long/text data/train_reseg exp/segment_wsj_long_utts_train
 This script performs segmentation of the data in <data-in> and writes out the
 segmented data (with a segments file) to
-<segmented-data-out> along with the corresponding aligned transcription.  
-Note: If <utt2text> is not provided, the "text" file in <data-in> is used as the 
+<segmented-data-out> along with the corresponding aligned transcription.
+Note: If <utt2text> is not provided, the "text" file in <data-in> is used as the
 raw transcripts to train biased LM for the utterances.
-If <utt2text> is provided, then it should be a mapping from the utterance-ids in 
-<data-in> to the transcript-keys in the file <text-in>, which will be 
+If <utt2text> is provided, then it should be a mapping from the utterance-ids in
+<data-in> to the transcript-keys in the file <text-in>, which will be
 used to train biased LMs for the utterances.
 The purpose of this script is to divide up the input data (which may consist of
 long recordings such as television shows or audiobooks) into segments which are
@@ -86,7 +86,7 @@ if [ $# -eq 7 ]; then
   out_data=$6
   dir=$7
   extra_files="$utt2text"
-else 
+else
   out_data=$4
   dir=$5
 fi
@@ -103,12 +103,12 @@ data_id=`basename $data`
 mkdir -p $dir
 
 data_uniform_seg=$dir/${data_id}_uniform_seg
-  
+
 frame_shift=`utils/data/get_frame_shift.sh $data`
 
-# First we split the data into segments of around 30s long, on which 
-# it would be possible to do a decoding. 
-# A diarization step will be added in the future. 
+# First we split the data into segments of around 30s long, on which
+# it would be possible to do a decoding.
+# A diarization step will be added in the future.
 if [ $stage -le 1 ]; then
   echo "$0: Stage 1 (Splitting data directory $data into uniform segments)"
 
@@ -133,12 +133,12 @@ if [ $stage -le 2 ]; then
       $data $dir/uniform_sub_segments $dir/${data_id}_uniform_seg.temp
 
     utils/data/modify_speaker_info.sh --seconds-per-spk-max $seconds_per_spk_max \
-      $dir/${data_id}_uniform_seg.temp $data_uniform_seg 
+      $dir/${data_id}_uniform_seg.temp $data_uniform_seg
   else
     utils/data/subsegment_data_dir.sh \
       $data $dir/uniform_sub_segments $data_uniform_seg
   fi
-  
+
   utils/fix_data_dir.sh $data_uniform_seg
 
   # Compute new cmvn stats for the segmented data directory
@@ -157,19 +157,19 @@ if [ $stage -le 3 ]; then
   cp $srcdir/phones.txt $dir 2>/dev/null || true
 
   mkdir -p $graph_dir
-  
-  # Make graphs w.r.t. to the original text (usually recording-level) 
+
+  # Make graphs w.r.t. the original text (usually recording-level)
   steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
     --nj $nj --cmd "$cmd" $text \
     $lang $dir $dir/graphs
   if [ -z "$utt2text" ]; then
-    # and then copy it to the sub-segments. 
+    # and then copy it to the sub-segments.
     cat $dir/uniform_sub_segments | awk '{print $1" "$2}' | \
       utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp | \
       sort -k1,1 > \
       $graph_dir/HCLG.fsts.scp
   else
-    # and then copy it to the sub-segments. 
+    # and then copy it to the sub-segments.
     cat $dir/uniform_sub_segments | awk '{print $1" "$2}' | \
       utils/apply_map.pl -f 2 $utt2text | \
       utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp | \
@@ -187,13 +187,13 @@ mkdir -p $decode_dir
 
 if [ $stage -le 4 ]; then
   echo "$0: Decoding with biased language models..."
-  
+
   if [ -f $srcdir/trans.1 ]; then
     steps/cleanup/decode_fmllr_segmentation.sh \
       --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \
       --skip-scoring true --allow-partial false \
       $graph_dir $data_uniform_seg $decode_dir
-  else 
+  else
     steps/cleanup/decode_segmentation.sh \
       --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \
       --skip-scoring true --allow-partial false \
@@ -202,25 +202,24 @@ if [ $stage -le 4 ]; then
 fi
 
 if [ $stage -le 5 ]; then
-  steps/cleanup/internal/get_ctm.sh \
-    --lmwt $lmwt --cmd "$cmd --mem 4G" \
+  steps/get_ctm_fast.sh --lmwt $lmwt --cmd "$cmd --mem 4G" \
     --print-silence true \
-    $data_uniform_seg $lang $decode_dir
+    $data_uniform_seg $lang $decode_dir $decode_dir/ctm_$lmwt
 fi
 
-# Split the original text into documents, over which we can do 
+# Split the original text into documents, over which we can do
 # searching reasonably efficiently. Also get a mapping from the original
 # text to the created documents (i.e. text2doc)
-# Since the Smith-Waterman alignment is linear in the length of the 
-# text, we want to keep it reasonably small (a few thousand words). 
+# Since the Smith-Waterman alignment is linear in the length of the
+# text, we want to keep it reasonably small (a few thousand words).
 
 if [ $stage -le 6 ]; then
   # Split the reference text into documents.
   mkdir -p $dir/docs
-    
+
   # text2doc is a mapping from the original transcript to the documents
   # it is split into.
-  # The format is 
+  # The format is
   # <original-transcript> <doc1> <doc2> ...
   steps/cleanup/internal/split_text_into_docs.pl --max-words $max_words \
     $text $dir/docs/doc2text $dir/docs/docs.txt
@@ -230,11 +229,11 @@ fi
 if [ $stage -le 7 ]; then
   # Get TF-IDF for the reference documents.
   echo $nj > $dir/docs/num_jobs
-  
+
   utils/split_data.sh $data_uniform_seg $nj
 
   mkdir -p $dir/docs/split$nj/
-  
+
   # First compute IDF stats
   $cmd $dir/log/compute_source_idf_stats.log \
     steps/cleanup/internal/compute_tf_idf.py \
@@ -242,23 +241,23 @@ if [ $stage -le 7 ]; then
     --idf-weighting-scheme="log" \
     --output-idf-stats=$dir/docs/idf_stats.txt \
     $dir/docs/docs.txt $dir/docs/src_tf_idf.txt
-  
+
   # Split documents so that they can be accessed easily by parallel jobs.
   mkdir -p $dir/docs/split$nj/
   sdir=$dir/docs/split$nj
   for n in `seq $nj`; do
 
-    # old2new_utts is a mapping from the original segments to the 
+    # old2new_utts is a mapping from the original segments to the
     # new segments created by uniformly segmenting.
     # The format is <old-utterance> <new-utt1> <new-utt2> ...
     utils/filter_scp.pl $data_uniform_seg/split$nj/$n/utt2spk $dir/uniform_sub_segments | \
       cut -d ' ' -f 1,2 | utils/utt2spk_to_spk2utt.pl > $sdir/old2new_utts.$n.txt
 
     if [ ! -z "$utt2text" ]; then
-      # utt2text, if provided, is a mapping from the <old-utterance> to 
+      # utt2text, if provided, is a mapping from the <old-utterance> to
       # <original-transript>.
-      # Since text2doc is mapping from <original-transcript> to documents, we 
-      # first have to find the original-transcripts that are in the current 
+      # Since text2doc is mapping from <original-transcript> to documents, we
+      # first have to find the original-transcripts that are in the current
       # split.
       utils/filter_scp.pl $sdir/old2new_utts.$n.txt $utt2text | \
         cut -d ' ' -f 2 | sort -u | \
@@ -273,13 +272,13 @@ if [ $stage -le 7 ]; then
       $sdir/docs.$n.txt
   done
 
-  # Compute TF-IDF for the source documents. 
+  # Compute TF-IDF for the source documents.
   $cmd JOB=1:$nj $dir/docs/log/get_tfidf_for_source_texts.JOB.log \
     steps/cleanup/internal/compute_tf_idf.py \
       --tf-weighting-scheme="raw" \
       --idf-weighting-scheme="log" \
       --input-idf-stats=$dir/docs/idf_stats.txt \
-      $sdir/docs.JOB.txt $sdir/src_tf_idf.JOB.txt 
+      $sdir/docs.JOB.txt $sdir/src_tf_idf.JOB.txt
 
   sdir=$dir/docs/split$nj
   # Make $sdir an absolute pathname.
@@ -288,15 +287,15 @@ if [ $stage -le 7 ]; then
   for n in `seq $nj`; do
     awk -v f="$sdir/src_tf_idf.$n.txt" '{print $1" "f}' \
       $sdir/text2doc.$n
-  done | perl -ane 'BEGIN { %tfidfs = (); } 
-  { 
-    if (!defined $tfidfs{$F[0]}) { 
-      $tfidfs{$F[0]} = $F[1]; 
-    } 
-  } 
+  done | perl -ane 'BEGIN { %tfidfs = (); }
+  {
+    if (!defined $tfidfs{$F[0]}) {
+      $tfidfs{$F[0]} = $F[1];
+    }
+  }
   END {
-  while(my ($k, $v) = each %tfidfs) { 
-    print "$k $v\n"; 
+  while(my ($k, $v) = each %tfidfs) {
+    print "$k $v\n";
   } }' > $dir/docs/source2tf_idf.scp
 fi
 
@@ -317,18 +316,18 @@ if [ $stage -le 9 ]; then
   sdir=$dir/query_docs/split$nj
   mkdir -p $sdir
 
-  # Compute TF-IDF for the query documents (decode hypotheses). 
+  # Compute TF-IDF for the query documents (decode hypotheses).
   # The output is an archive of TF-IDF indexed by the query.
-  $cmd JOB=1:$nj $dir/lats/log/compute_query_tf_idf.JOB.log \
+  $cmd JOB=1:$nj $decode_dir/ctm_$lmwt/log/compute_query_tf_idf.JOB.log \
     steps/cleanup/internal/ctm_to_text.pl --non-scored-words $dir/non_scored_words.txt \
-      $dir/lats/score_$lmwt/${data_id}_uniform_seg.ctm.JOB \| \
+      $decode_dir/ctm_$lmwt/ctm.JOB \| \
     steps/cleanup/internal/compute_tf_idf.py \
       --tf-weighting-scheme="normalized" \
       --idf-weighting-scheme="log" \
       --input-idf-stats=$dir/docs/idf_stats.txt \
       --accumulate-over-docs=false \
       - $sdir/query_tf_idf.JOB.ark.txt
-  
+
   # The relevant documents can be found using TF-IDF similarity and nearby
   # documents can also be picked for the Smith-Waterman alignment stage.
 
@@ -345,15 +344,15 @@ if [ $stage -le 9 ]; then
   # The query TF-IDFs are all indexed by the utterance-id of the sub-segments.
   # The source TF-IDFs use the document-ids created by splitting the reference
   # text into documents.
-  # For each query, we need to retrieve the documents that were created from 
-  # the same original utterance that the sub-segment was from. For this, 
-  # we have to load the source TF-IDF that has those documents. This 
+  # For each query, we need to retrieve the documents that were created from
+  # the same original utterance that the sub-segment was from. For this,
+  # we have to load the source TF-IDF that has those documents. This
   # information is provided using the option --source-text-id2tf-idf-file.
-  # The output of this script is a file where the first column is the 
+  # The output of this script is a file where the first column is the
   # query-id (i.e. sub-segment-id) and the remaining columns, which is at least
   # one in number and a maxmium of (1 + 2 * num-neighbors-to-search) columns
   # is the document-ids for the retrieved documents.
-  $cmd JOB=1:$nj $dir/lats/log/retrieve_similar_docs.JOB.log \
+  $cmd JOB=1:$nj $dir/log/retrieve_similar_docs.JOB.log \
     steps/cleanup/internal/retrieve_similar_docs.py \
       --query-tfidf=$dir/query_docs/split$nj/query_tf_idf.JOB.ark.txt \
       --source-text-id2tfidf=$dir/docs/source2tf_idf.scp \
@@ -362,8 +361,8 @@ if [ $stage -le 9 ]; then
       --num-neighbors-to-search=$num_neighbors_to_search \
       --neighbor-tfidf-threshold=$neighbor_tfidf_threshold \
       --relevant-docs=$dir/query_docs/split$nj/relevant_docs.JOB.txt
-  
-  $cmd JOB=1:$nj $dir/lats/log/get_ctm_edits.JOB.log \
+
+  $cmd JOB=1:$nj $decode_dir/ctm_$lmwt/log/get_ctm_edits.JOB.log \
     steps/cleanup/internal/stitch_documents.py \
       --query2docs=$dir/query_docs/split$nj/relevant_docs.JOB.txt \
       --input-documents=$dir/docs/split$nj/docs.JOB.txt \
@@ -371,18 +370,18 @@ if [ $stage -le 9 ]; then
     steps/cleanup/internal/align_ctm_ref.py --eps-symbol='"<eps>"' \
       --oov-word="'`cat $lang/oov.txt`'" --symbol-table=$lang/words.txt \
       --hyp-format=CTM --align-full-hyp=$align_full_hyp \
-      --hyp=$dir/lats/score_$lmwt/${data_id}_uniform_seg.ctm.JOB --ref=- \
-      --output=$dir/lats/score_$lmwt/${data_id}_uniform_seg.ctm_edits.JOB 
-  
+      --hyp=$decode_dir/ctm_$lmwt/ctm.JOB --ref=- \
+      --output=$decode_dir/ctm_$lmwt/ctm_edits.JOB
+
   for n in `seq $nj`; do
-    cat $dir/lats/score_$lmwt/${data_id}_uniform_seg.ctm_edits.$n 
-  done > $dir/lats/score_$lmwt/ctm_edits
-  
+    cat $decode_dir/ctm_$lmwt/ctm_edits.$n
+  done > $decode_dir/ctm_$lmwt/ctm_edits
+
 fi
 
 if [ $stage -le 10 ]; then
   steps/cleanup/internal/resolve_ctm_edits_overlaps.py \
-    ${data_uniform_seg}/segments $dir/lats/score_$lmwt/ctm_edits $dir/ctm_edits
+    ${data_uniform_seg}/segments $decode_dir/ctm_$lmwt/ctm_edits $dir/ctm_edits
 fi
 
 if [ $stage -le 11 ]; then
@@ -421,7 +420,7 @@ if [ $stage -le 13 ]; then
   --splitting.min-silence-length=$min_silence_length_to_split_at
   --splitting.min-non-scored-length=$min_non_scored_length_to_split_at
   )
-  
+
   $cmd $dir/log/segment_ctm_edits.log \
     steps/cleanup/internal/segment_ctm_edits_mild.py \
       ${segmentation_opts[@]} $segmentation_extra_opts \
diff --git a/egs/wsj/s5/steps/dict/train_g2p.sh b/egs/wsj/s5/steps/dict/train_g2p.sh
index 2e4df49b71b..d793bbb5d8f 100755
--- a/egs/wsj/s5/steps/dict/train_g2p.sh
+++ b/egs/wsj/s5/steps/dict/train_g2p.sh
@@ -44,8 +44,9 @@ mkdir -p $wdir/log
 
 # Optionally remove words that are mapped to a single silence phone from the lexicon.
 if $only_words && [ ! -z "$silence_phones" ]; then
-  awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i;a[$1]=s;if(!(s in a)) print $1" "s}' \
-    $silence_phones > $wdir/lexicon_onlywords.txt
+  awk -v s=$silence_phones \
+    'BEGIN{while((getline<s)>0) {for(i=1;i<=NF;i++) sil[$i]=1;}}
+    {if (!(NF == 2 && $2 in sil)) print;}' $lexicon > $wdir/lexicon_onlywords.txt
   lexicon=$wdir/lexicon_onlywords.txt
 fi
 
diff --git a/egs/wsj/s5/steps/get_ctm_fast.sh b/egs/wsj/s5/steps/get_ctm_fast.sh
new file mode 100755
index 00000000000..613061f7df8
--- /dev/null
+++ b/egs/wsj/s5/steps/get_ctm_fast.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.
+# Copyright 2017  Vimal Manohar
+
+# This script produces CTM files from a decoding directory that has lattices
+# present.
+# This is similar to get_ctm.sh, but gets the CTM at the utterance-level.
+# It can be faster than steps/get_ctm.sh --use-segments false as it splits
+# the process across many jobs.
+
+# begin configuration section.
+cmd=run.pl  # launcher used for the per-job lattice-to-CTM pipelines below
+stage=0  # NOTE(review): this variable is never read by this script
+frame_shift=0.01  # passed to nbest-to-ctm as --frame-shift
+lmwt=10  # passed to lattice-1best as --lm-scale
+print_silence=false  # passed to nbest-to-ctm as --print-silence
+#end configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 4 ]; then  # all four positional arguments are required
+  echo "Usage: $0 [options] <data-dir> <lang-dir|graph-dir> <decode-dir> <ctm-out-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --lmwt (default=10)             # LM scale used when picking the 1-best path."
+  echo "    --print-silence (default=false) # passed on to nbest-to-ctm --print-silence."
+  echo "    --frame-shift (default=0.01)    # set this if your lattices' frame-shift is not 0.01 seconds"
+  echo "e.g.:"
+  echo "$0 data/train data/lang exp/tri4a/decode/ exp/tri4a/decode/ctm_10"
+  echo "See also: steps/get_ctm.sh"
+  exit 1;
+fi
+
+data=$1  # NOTE(review): assigned but not referenced anywhere below
+lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
+decode_dir=$3  # directory that holds lat.JOB.gz and num_jobs
+dir=$4  # output directory: receives ctm.JOB, ctm, num_jobs and log/
+
+if [ -f $decode_dir/final.mdl ]; then  # prefer a model stored next to the lattices
+  model=$decode_dir/final.mdl
+else
+  model=$decode_dir/../final.mdl # assume model one level up from decoding dir.
+fi
+
+for f in $lang/words.txt $model $decode_dir/lat.1.gz; do  # stop early if a required input is missing
+  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
+done
+
+mkdir -p $dir
+
+nj=$(cat $decode_dir/num_jobs)  # reuse the decoding job count; the jobs below read lat.JOB.gz
+echo $nj > $dir/num_jobs
+
+if [ -f $lang/phones/word_boundary.int ]; then  # word alignment via lattice-align-words
+  $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
+    set -o pipefail '&&' \
+    lattice-1best --lm-scale=$lmwt "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \
+    lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
+    nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
+    utils/int2sym.pl -f 5 $lang/words.txt \
+    '>' $dir/ctm.JOB || exit 1;
+elif [ -f $lang/phones/align_lexicon.int ]; then  # otherwise via lattice-align-words-lexicon
+  $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
+    set -o pipefail '&&' \
+    lattice-1best --lm-scale=$lmwt "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \
+    lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
+    lattice-1best ark:- ark:- \| \
+    nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
+    utils/int2sym.pl -f 5 $lang/words.txt \
+    '>' $dir/ctm.JOB || exit 1;
+else
+  echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
+  exit 1;
+fi
+
+for n in `seq $nj`; do  # concatenate the per-job CTMs, in job order, into $dir/ctm
+  cat $dir/ctm.$n
+done > $dir/ctm
diff --git a/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh b/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh
index 0d6713d52b8..5db6be731ce 100755
--- a/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh
+++ b/egs/wsj/s5/utils/data/convert_data_dir_to_whole.sh
@@ -1,6 +1,6 @@
 #! /bin/bash
 
-# Copyright 2016  Vimal Manohar
+# Copyright 2016-2018  Vimal Manohar
 # Apache 2.0
 
 # This scripts converts a data directory into a "whole" data directory
@@ -11,9 +11,7 @@ set -o pipefail
 
 . ./path.sh
 
-cmd=run.pl
-
-. parse_options.sh
+. utils/parse_options.sh
 
 if [ $# -ne 2 ]; then
   echo "Usage: convert_data_dir_to_whole.sh <in-data> <out-data>"
@@ -32,75 +30,27 @@ fi
 
 mkdir -p $dir
 cp $data/wav.scp $dir
-cp $data/reco2file_and_channel $dir
-rm -f $dir/{utt2spk,text} || true
+if [ -f $data/reco2file_and_channel ]; then
+  cp $data/reco2file_and_channel $dir
+fi
+
+# Move any pre-existing feats.scp/cmvn.scp in $dir out of the way; a fresh $dir
+# has neither, so only move what exists. utt2spk is regenerated further down.
+mkdir -p $dir/.backup
+for f in feats.scp cmvn.scp; do if [ -f $dir/$f ]; then mv $dir/$f $dir/.backup; fi; done; rm -f $dir/utt2spk
 
 [ -f $data/stm ] && cp $data/stm $dir
 [ -f $data/glm ] && cp $data/glm $dir
 
-text_files=
-[ -f $data/text ] && text_files="$data/text $dir/text"
-
-# Combine utt2spk and text from the segments into utt2spk and text for the whole
-# recording.
-cat $data/segments | perl -e '
-if (scalar @ARGV == 3) {
-  ($utt2spk_in, $text_in, $text_out) = @ARGV;
-} elsif (scalar @ARGV == 1) {
-  $utt2spk_in = $ARGV[0];
-} else {
-  die "Unexpected number of arguments";
-}
-
-if (defined $text_in) {
-  open(TI, "<$text_in") || die "Error: fail to open $text_in\n";
-  open(TO, ">$text_out") || die "Error: fail to open $text_out\n";
-}
-open(UI, "<$utt2spk_in") || die "Error: fail to open $utt2spk_in\n";
-
-my %file2utt = ();
-while (<STDIN>) {
-  chomp;
-  my @col = split;
-  @col >= 4 or die "bad line $_\n";
-
-  if (! defined $file2utt{$col[1]}) {
-    $file2utt{$col[1]} = [];
-  }
-  push @{$file2utt{$col[1]}}, $col[0]; 
-}
-
-my %text = ();
-my %utt2spk = ();
-
-while (<UI>) {
-  chomp; 
-  my @col = split;
-  $utt2spk{$col[0]} = $col[1];
-}
-
-if (defined $text_in) {
-  while (<TI>) {
-    chomp;
-    my @col = split;
-    @col >= 1 or die "bad line $_\n";
-
-    my $utt = shift @col;
-    $text{$utt} = join(" ", @col);
-  }
-}
-
-foreach $file (keys %file2utt) {
-  my @utts = @{$file2utt{$file}};
-  print "$file $file\n";
-
-  if (defined $text_in) {
-    $text_line = "";
-    print TO "$file $text_line\n";
-  }
-}
-' $data/utt2spk $text_files > $dir/utt2spk
-
-utils/spk2utt_to_utt2spk.pl $dir/utt2spk > $dir/spk2utt
-
-utils/fix_data_dir.sh $dir
+utils/data/internal/combine_segments_to_recording.py \
+  --write-reco2utt=$dir/reco2sorted_utts $data/segments $dir/utt2spk || exit 1
+
+if [ -f $data/text ]; then
+  # "-f 2-" maps every utterance-id on the line; "-f 2" would map only the first.
+  utils/apply_map.pl -f 2- $data/text < $dir/reco2sorted_utts > $dir/text || exit 1
+fi
+rm $dir/reco2sorted_utts
+
+utils/fix_data_dir.sh $dir || exit 1
+
+exit 0
diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh
index c415e8dfb81..e6a344d7d50 100755
--- a/egs/wsj/s5/utils/data/get_utt2dur.sh
+++ b/egs/wsj/s5/utils/data/get_utt2dur.sh
@@ -11,6 +11,8 @@
 # files in entirely.)
 
 frame_shift=0.01
+cmd=run.pl
+nj=4
 
 . utils/parse_options.sh
 . ./path.sh
@@ -80,11 +82,17 @@ elif [ -f $data/wav.scp ]; then
       echo "... perturb_data_dir_speed_3way.sh."
     fi
 
-    if ! wav-to-duration --read-entire-file=$read_entire_file scp:$data/wav.scp ark,t:$data/utt2dur 2>&1 | grep -v 'nonzero return status'; then
-      echo "$0: there was a problem getting the durations; moving $data/utt2dur to $data/.backup/"
-      mkdir -p $data/.backup/
-      mv $data/utt2dur $data/.backup/
-    fi
+    utils/data/split_data.sh --per-utt $data $nj
+    sdata=$data/split${nj}utt
+
+    $cmd JOB=1:$nj $data/log/get_durations.JOB.log \
+      wav-to-duration --read-entire-file=$read_entire_file \
+      scp:$sdata/JOB/wav.scp ark,t:$sdata/JOB/utt2dur || \
+        { echo "$0: there was a problem getting the durations"; exit 1; }
+
+    for n in `seq $nj`; do
+      cat $sdata/$n/utt2dur
+    done > $data/utt2dur
   fi
 elif [ -f $data/feats.scp ]; then
   echo "$0: wave file does not exist so getting durations from feats files"
diff --git a/egs/wsj/s5/utils/data/internal/combine_segments_to_recording.py b/egs/wsj/s5/utils/data/internal/combine_segments_to_recording.py
new file mode 100755
index 00000000000..8d810c68fe1
--- /dev/null
+++ b/egs/wsj/s5/utils/data/internal/combine_segments_to_recording.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+# Copyright 2018  Vimal Manohar
+# Apache 2.0
+
+from __future__ import print_function
+import argparse
+import sys
+import collections
+from collections import defaultdict
+
+def get_args():
+    parser = argparse.ArgumentParser(description="""
+        This script combines segments into utterances at
+        recording-level and write out new utt2spk file with reco-id as the
+        speakers. If --write-reco2utt is provided, it writes a mapping from
+        recording-id to the list of utterances sorted by start and end times.
+        This map can be used to combine text corresponding to the segments to
+        recording-level.""")
+
+    parser.add_argument("--write-reco2utt", help="If provided, writes a "
+                        "mapping from recording-id to list of utterances "
+                        "sorted by start and end times.")
+    parser.add_argument("segments_in", help="Input segments file")
+    parser.add_argument("utt2spk_out", help="Output utt2spk file")
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = get_args()
+
+    utt2reco = {}
+    segments_for_reco = defaultdict(list)
+    for line in open(args.segments_in):
+        parts = line.strip().split()
+
+        if len(parts) < 4:
+            raise TypeError("bad line in segments file {}".format(line))
+
+        utt = parts[0]
+        reco = parts[1]
+        start_time = parts[2]
+        end_time = parts[3]
+
+        segments_for_reco[reco].append((utt, start_time, end_time))
+        utt2reco[utt] = reco
+
+    if args.write_reco2utt is not None:
+        with open(args.write_reco2utt, 'w') as reco2utt_writer, \
+                open(args.utt2spk_out, 'w') as utt2spk_writer:
+            for reco, segments_in_reco in segments_for_reco.items():
+                utts = ' '.join([seg[0] for seg in sorted(
+                    segments_in_reco, key=lambda x:(x[1], x[2]))])
+                print("{0} {1}".format(reco, utts), file=reco2utt_writer)
+                print ("{0} {0}".format(reco), file=utt2spk_writer)
+    else:
+        with open(args.utt2spk_out, 'w') as utt2spk_writer:
+            for reco in segments_for_reco.keys():
+                print ("{0} {0}".format(reco), file=utt2spk_writer)
+
+
+if __name__ == "__main__":
+    main()